diff --git a/.github/ISSUE_TEMPLATE/---document-issue-.md b/.github/ISSUE_TEMPLATE/---document-issue-.md index 7c464ac584bc87..ffc2fcd7817b64 100644 --- a/.github/ISSUE_TEMPLATE/---document-issue-.md +++ b/.github/ISSUE_TEMPLATE/---document-issue-.md @@ -56,4 +56,4 @@ For example: no sample code; The sample code is not helpful; The sample code not For example:Chinese API in this doc is inconsistent with English API, including params, description, sample code, formula, etc. #### Other -For example: The doc link is broken; The doc page is missing; Dead link in docs. \ No newline at end of file +For example: The doc link is broken; The doc page is missing; Dead link in docs. diff --git a/CMakeLists.txt b/CMakeLists.txt index 765d8fc1578565..30f9e3a3dcdd2c 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License -cmake_minimum_required(VERSION 3.10) +cmake_minimum_required(VERSION 3.15) +cmake_policy(VERSION 3.10) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) @@ -32,16 +33,19 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) +option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) +# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON +# to develop some acl related functionality on x86 +option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) +option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) if (WITH_GPU AND WITH_XPU) message(FATAL_ERROR "Error when compile GPU and XPU at the same time") endif() -if (WITH_GPU AND WITH_ASCEND) +if (WITH_GPU AND WITH_ASCEND) message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time") endif() -# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them. -if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15)) - message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. " - "You can use cmake 3.16 (recommended), 3.10, 3.11, 3.15 or 3.17. 
Please refer to the install document: https://cmake.org/install/") +if (WITH_GPU AND WITH_ROCM) + message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time") endif() if(WITH_GPU AND NOT APPLE) @@ -61,6 +65,9 @@ if(WITH_MUSL) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy") endif() +if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") +endif() if(WIN32) option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) @@ -165,8 +172,6 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(ON_INFER "Turn on inference optimization and inference-lib generation" OFF) ################################ Internal Configurations ####################################### -option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) -option(WITH_RCCL "Compile PaddlePaddle with RCCL support" OFF) option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) @@ -179,12 +184,14 @@ option(WITH_XBYAK "Compile with xbyak support" ON) option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE}) +option(WITH_HETERPS "Compile with heterps" OFF}) option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE}) option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF) option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) +option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON) option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF) option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON) option(WITH_ARM "Compile PaddlePaddle with arm support" OFF) @@ -302,9 +309,9 @@ endif(WITH_ROCM) if (NOT WITH_ROCM AND WITH_RCCL) MESSAGE(WARNING - "Disable RCCL when compiling without GPU. Force WITH_RCCL=OFF.") - set(WITH_NCCL OFF CACHE STRING - "Disable RCCL when compiling without GPU" FORCE) + "Disable RCCL when compiling without ROCM. 
Force WITH_RCCL=OFF.") + set(WITH_RCCL OFF CACHE STRING + "Disable RCCL when compiling without ROCM" FORCE) endif() if(WITH_RCCL) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 9c1bd52e7fb7df..bf1352d4e11479 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -82,6 +82,10 @@ if(WITH_ASCEND) add_definitions(-DPADDLE_WITH_ASCEND) endif() +if(WITH_ASCEND_CL) + add_definitions(-DPADDLE_WITH_ASCEND_CL) +endif() + if(WITH_XPU) message(STATUS "Compile with XPU!") add_definitions(-DPADDLE_WITH_XPU) @@ -93,13 +97,18 @@ if(WITH_GPU) FIND_PACKAGE(CUDA REQUIRED) - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 7) - message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 10.1) + message(FATAL_ERROR "Paddle needs CUDA >= 10.1 to compile") endif() if(NOT CUDNN_FOUND) message(FATAL_ERROR "Paddle needs cudnn to compile") endif() + + if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) + message(FATAL_ERROR "Paddle needs CUDNN >= 7.0 to compile") + endif() + if(CUPTI_FOUND) include_directories(${CUPTI_INCLUDE_DIR}) add_definitions(-DPADDLE_WITH_CUPTI) @@ -164,6 +173,9 @@ if(WITH_PSCORE) add_definitions(-DPADDLE_WITH_PSCORE) endif() +if(WITH_HETERPS) + add_definitions(-DPADDLE_WITH_HETERPS) +endif() if(WITH_GRPC) add_definitions(-DPADDLE_WITH_GRPC) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index c4d1384312e3c9..7f2addb02d36dd 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -6,15 +6,9 @@ endif() if (WITH_NV_JETSON) add_definitions(-DWITH_NV_JETSON) set(paddle_known_gpu_archs "53 62 72") - set(paddle_known_gpu_archs7 "53") - set(paddle_known_gpu_archs8 "53 62") - set(paddle_known_gpu_archs9 "53 62") set(paddle_known_gpu_archs10 "53 62 72") else() - set(paddle_known_gpu_archs "30 35 50 52 60 61 70") - set(paddle_known_gpu_archs7 "30 35 50 52") - set(paddle_known_gpu_archs8 "30 35 50 52 60 61") - set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70") + set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80") set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75") set(paddle_known_gpu_archs11 "52 60 61 70 75 80") endif() @@ -74,7 +68,7 @@ endfunction() # select_nvcc_arch_flags(out_variable) function(select_nvcc_arch_flags out_variable) # List of arch names - set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual") + set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "Ampere" "All" "Manual") set(archs_name_default "Auto") list(APPEND archs_names "Auto") @@ -108,6 +102,8 @@ function(select_nvcc_arch_flags out_variable) set(cuda_arch_bin "70") elseif(${CUDA_ARCH_NAME} STREQUAL "Turing") set(cuda_arch_bin "75") + elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere") + set(cuda_arch_bin "80") elseif(${CUDA_ARCH_NAME} STREQUAL "All") set(cuda_arch_bin ${paddle_known_gpu_archs}) elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") @@ -158,25 +154,7 @@ function(select_nvcc_arch_flags out_variable) endfunction() message(STATUS "CUDA detected: " ${CMAKE_CUDA_COMPILER_VERSION}) -if (${CMAKE_CUDA_COMPILER_VERSION} LESS 7.0) - set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 8.0) # CUDA 7.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 9.0) # CUDA 8.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") - set(CMAKE_CUDA_FLAGS 
"${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") - # CUDA 8 may complain that sm_20 is no longer supported. Suppress the - # warning for now. - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) # CUDA 9.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs9}) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x +if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs10}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") @@ -206,14 +184,11 @@ select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA}") message(STATUS "NVCC_FLAGS_EXTRA: ${NVCC_FLAGS_EXTRA}") -# Set C++11 support +# Set C++14 support set(CUDA_PROPAGATE_HOST_FLAGS OFF) # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. -if (NOT WIN32) # windows msvc2015 support c++11 natively. - # -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake. - set(CMAKE_CUDA_STANDARD 11) -endif(NOT WIN32) +set(CMAKE_CUDA_STANDARD 14) # (Note) For windows, if delete /W[1-4], /W1 will be added defaultly and conflic with -w # So replace /W[1-4] with /W0 diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index d8d8f634e76b6b..c82847100abefa 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -94,7 +94,7 @@ macro(find_cudnn_version cudnn_header_file) "${CUDNN_MAJOR_VERSION} * 1000 + ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") message(STATUS "Current cuDNN header is ${cudnn_header_file} " - "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ") + "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCHLEVEL_VERSION}. ") endif() endif() endmacro() diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index bcf0c0a0646fc3..bddd2023b437b1 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -12,50 +12,69 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-INCLUDE(ExternalProject) - -SET(ASCEND_PROJECT "extern_ascend") -IF((NOT DEFINED ASCEND_VER) OR (NOT DEFINED ASCEND_URL)) - MESSAGE(STATUS "use pre defined download url") - SET(ASCEND_VER "0.1.1" CACHE STRING "" FORCE) - SET(ASCEND_NAME "ascend" CACHE STRING "" FORCE) - SET(ASCEND_URL "http://paddle-ascend.bj.bcebos.com/ascend.tar.gz" CACHE STRING "" FORCE) -ENDIF() -MESSAGE(STATUS "ASCEND_NAME: ${ASCEND_NAME}, ASCEND_URL: ${ASCEND_URL}") -SET(ASCEND_SOURCE_DIR "${THIRD_PARTY_PATH}/ascend") -SET(ASCEND_DOWNLOAD_DIR "${ASCEND_SOURCE_DIR}/src/${ASCEND_PROJECT}") -SET(ASCEND_DST_DIR "ascend") -SET(ASCEND_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") -SET(ASCEND_INSTALL_DIR ${ASCEND_INSTALL_ROOT}/${ASCEND_DST_DIR}) -SET(ASCEND_ROOT ${ASCEND_INSTALL_DIR}) -SET(ASCEND_INC_DIR ${ASCEND_ROOT}/include) -SET(ASCEND_LIB_DIR ${ASCEND_ROOT}/lib) -SET(ASCEND_LIB ${ASCEND_LIB_DIR}/libge_runner.so) -SET(ASCEND_GRAPH_LIB ${ASCEND_LIB_DIR}/libgraph.so) -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${ASCEND_ROOT}/lib") - -INCLUDE_DIRECTORIES(${ASCEND_INC_DIR}) -FILE(WRITE ${ASCEND_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(ASCEND)\n" - "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY ${ASCEND_NAME}/include ${ASCEND_NAME}/lib \n" - " DESTINATION ${ASCEND_DST_DIR})\n") -ExternalProject_Add( - ${ASCEND_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${ASCEND_SOURCE_DIR} - DOWNLOAD_DIR ${ASCEND_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${ASCEND_URL} -c -q -O ${ASCEND_NAME}.tar.gz - && tar zxvf ${ASCEND_NAME}.tar.gz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ASCEND_INSTALL_ROOT} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ASCEND_INSTALL_ROOT} -) -ADD_LIBRARY(ascend SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ascend PROPERTY IMPORTED_LOCATION ${ASCEND_LIB}) - -ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${ASCEND_GRAPH_LIB}) -ADD_DEPENDENCIES(ascend ascend_graph ${ASCEND_PROJECT}) +#NOTE: Logic is from +# https://github.com/mindspore-ai/graphengine/blob/master/CMakeLists.txt +if(DEFINED ENV{ASCEND_CUSTOM_PATH}) + set(ASCEND_DIR $ENV{ASCEND_CUSTOM_PATH}) +else() + set(ASCEND_DIR /usr/local/Ascend) +endif() + +if(WITH_ASCEND) + set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64) + set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common) + set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share) + set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64) + set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64) + set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64) + set(STATIC_ACL_LIB ${ASCEND_ACL_DIR}) + + set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR}) + set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR}) + set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) + set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) + set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64) + set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64) + set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR}) + + set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so) + set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so) + set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so) + INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR}) + + if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h) + add_definitions(-DPADDLE_WITH_ASCEND_STRING) + endif() + + 
ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib}) + + ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib}) + + ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib}) + + add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl) +endif() + +if(WITH_ASCEND_CL) + set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) + + set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so) + set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so) + set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) + + message(STATUS "ASCEND_CL_INC_DIR ${ASCEND_CL_INC_DIR}") + message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}") + INCLUDE_DIRECTORIES(${ASCEND_CL_INC_DIR}) + + ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib}) + + ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib}) + add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler) + +endif() diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake index 0eb590c42d0cb7..2d72b6eb56deaa 100644 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -39,9 +39,9 @@ set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/ ExternalProject_Add( extern_brpc ${EXTERNAL_PROJECT_LOG_ARGS} - # TODO(gongwb): change to de newst repo when they changed. + # TODO(gongwb): change to de newst repo when they changed GIT_REPOSITORY "https://github.com/wangjiawei04/brpc" - GIT_TAG "6d79e0b17f25107c35b705ea58d888083f59ff47" + GIT_TAG "e203afb794caf027da0f1e0776443e7d20c0c28e" PREFIX ${BRPC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 5a755a816c332a..4619f9f7b7e34c 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -14,11 +14,11 @@ include(ExternalProject) -# update eigen to the commit id 4da2c6b1 on 03/19/2020 +# update eigen to the commit id f612df27 on 03/16/2021 set(EIGEN_PREFIX_DIR ${THIRD_PARTY_PATH}/eigen3) set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3) set(EIGEN_REPOSITORY https://gitlab.com/libeigen/eigen.git) -set(EIGEN_TAG 4da2c6b1974827b1999bab652a3d4703e1992d26) +set(EIGEN_TAG f612df273689a19d25b45ca4f8269463207c4fee) cache_third_party(extern_eigen3 REPOSITORY ${EIGEN_REPOSITORY} @@ -27,47 +27,13 @@ cache_third_party(extern_eigen3 if(WIN32) add_definitions(-DEIGEN_STRONG_INLINE=inline) - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Half.h native_src) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/arch/CUDA/Half.h native_dst) - # For Windows - # which will cause a compilation error in Tensor:74: - # "can not open file 'unistd.h'" - # so use following patch to solve compilation error On Windows. - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Tensor native_src2) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/Tensor native_dst2) - # For VS2015 - # which will cause a compilation error in TensorBlock.h:1028: - # "syntax error" - # so use following patch to solve compilation error On Windows. 
- file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorBlock.h native_src3) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h native_dst3) - set(EIGEN_PATCH_COMMAND copy ${native_src} ${native_dst} /Y && copy ${native_src2} ${native_dst2} /Y && copy ${native_src3} ${native_dst3} /Y) elseif(LINUX) - # For gxx=4.8, __GXX_ABI_VERSION is less than 1004 - # which will cause a compilation error in Geometry_SSE.h:38: - # "no matching function for call to 'pmul(Eigen::internal::Packet4f&, __m128)" - # refer to: https://gitlab.com/libeigen/eigen/-/blob/4da2c6b1974827b1999bab652a3d4703e1992d26/Eigen/src/Core/arch/SSE/PacketMath.h#L33-60 - # add -fabi-version=4 could avoid above error, but will cause "double free corruption" when compile with gcc8 - # so use following patch to solve compilation error with different version of gcc. - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Geometry_SSE.h native_src1) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Geometry/arch/Geometry_SSE.h native_dst1) - # The compiler fully support const expressions since c++14, - # but Eigen use some const expressions such as std::max and std::min, which are not supported in c++11 - # add patch to avoid compilation error in c++11 - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/MathFunctions.h native_src2) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/MathFunctions.h native_dst2) if(WITH_ROCM) # For HIPCC Eigen::internal::device::numeric_limits is not EIGEN_DEVICE_FUNC # which will cause compiler error of using __host__ funciont in __host__ __device__ - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src3) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst3) - # For HIPCC Eigen::internal::scalar_sum_op is not EIGEN_DEVICE_FUNC - # which will cause compiler error of using __host__ funciont in __host__ __device__ - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/BinaryFunctors.h native_src4) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/functors/BinaryFunctors.h native_dst4) - set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2} && cp ${native_src3} ${native_dst3} && cp ${native_src4} ${native_dst4}) - else() - set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2}) + file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src) + file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst) + set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst}) endif() endif() @@ -82,7 +48,7 @@ ExternalProject_Add( PREFIX ${EIGEN_PREFIX_DIR} SOURCE_DIR ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" - PATCH_COMMAND ${EIGEN_PATCH_COMMAND} + PATCH_COMMAND ${EIGEN_PATCH_COMMAND} CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake index ea7af315e1a690..e8db13a694f557 100644 --- a/cmake/external/gloo.cmake +++ b/cmake/external/gloo.cmake @@ -32,21 +32,39 @@ cache_third_party(extern_gloo TAG ${GLOO_TAG} DIR GLOO_SOURCE_DIR) -ExternalProject_Add( - extern_gloo - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - "${GLOO_DOWNLOAD_CMD}" - PREFIX "${GLOO_PREFIX_DIR}" - SOURCE_DIR "${GLOO_SOURCE_DIR}" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build - && cd ${GLOO_SOURCE_DIR}/build && cmake .. 
&& make - && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo - INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} - COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" -) + if(WITH_ASCEND OR WITH_ASCEND_CL) + ExternalProject_Add( + extern_gloo + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${GLOO_DOWNLOAD_CMD}" + PREFIX "${GLOO_PREFIX_DIR}" + SOURCE_DIR "${GLOO_SOURCE_DIR}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build + && cd ${GLOO_SOURCE_DIR}/build && cmake .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && make + && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" + ) +else() + ExternalProject_Add( + extern_gloo + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${GLOO_DOWNLOAD_CMD}" + PREFIX "${GLOO_PREFIX_DIR}" + SOURCE_DIR "${GLOO_SOURCE_DIR}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build + && cd ${GLOO_SOURCE_DIR}/build && cmake .. && make + && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" + ) +endif() ADD_LIBRARY(gloo STATIC IMPORTED GLOBAL) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake deleted file mode 100644 index 536e95c1dc2a4f..00000000000000 --- a/cmake/external/grpc.cmake +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -include (ExternalProject) - -SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc) -SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) -SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." 
FORCE) -SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) - -include(ProcessorCount) -ProcessorCount(NUM_OF_PROCESSOR) - -IF(APPLE) - SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin | sed "s/-Werror//g" | sh) - SET(GRPC_INSTALL_CMD make prefix=${GRPC_INSTALL_DIR} install) -ELSE() - SET(GRPC_CFLAGS "-Wno-error -std=c11 ${CLFAGS}") - SET(GRPC_CXXFLAGS "-Wno-error -std=c++11 ${CXXFLAGS}") - SET(BUILD_CMD make CFLAGS=${GRPC_CFLAGS} CXXFLAGS=${GRPC_CXXFLAGS} HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin) - SET(GRPC_INSTALL_CMD make prefix=${GRPC_INSTALL_DIR} install CFLAGS=${GRPC_CFLAGS} CXXFLAGS=${GRPC_CXXFLAGS}) -ENDIF() - -# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them -ExternalProject_Add( - extern_grpc - DEPENDS protobuf zlib - # NOTE(wuyi): - # this package is generated by following steps: - # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git - # 2. git submodule update --init - # 3. keep only zlib, cares, protobuf, boringssl under "third_party", - # checkout and clean other dirs under third_party - # 4. remove .git, and package the directory. - URL http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x_paddle.tar.gz - URL_MD5 f5442d137ddccee252e194b1bc90f98c - PREFIX ${GRPC_SOURCES_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_IN_SOURCE 1 - # NOTE(yuyang18): - # Disable -Werror, otherwise the compile will fail in MacOS. - # It seems that we cannot configure that by make command. - # Just dry run make command and remove `-Werror`, then use a shell to run make commands - BUILD_COMMAND ${BUILD_CMD} - INSTALL_COMMAND ${GRPC_INSTALL_CMD} -) - -ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a") - -ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc++.a") -ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgpr.a") - -ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a") - -include_directories(${GRPC_INCLUDE_DIR}) -ADD_DEPENDENCIES(grpc++_unsecure extern_grpc) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 884219d8dd81f3..fb1d4d9d56dcc6 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." 
FORCE) SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git) -SET(MKLDNN_TAG 72efa005effb49595933e033cc732f215ef0445a) +SET(MKLDNN_TAG f58682cd8bd0615f41d879f8afc8f1511ab42d24) # Introduce variables: # * CMAKE_INSTALL_LIBDIR diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 40a27f506f3077..c108c05368c915 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -198,8 +198,16 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}") ENDIF() +if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) + SET(PROTOBUF_TAG v3.8.0) +elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) + SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) + SET(PROTOBUF_TAG v3.8.0) +else() SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) +endif() cache_third_party(${TARGET_NAME} REPOSITORY ${PROTOBUF_REPOSITORY} @@ -234,7 +242,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ) ENDFUNCTION() -SET(PROTOBUF_VERSION 3.1.0) +if(WITH_ASCEND OR WITH_ASCEND_CL) + SET(PROTOBUF_VERSION 3.8.0) +else() + SET(PROTOBUF_VERSION 3.1.0) +endif() IF(NOT PROTOBUF_FOUND) build_protobuf(extern_protobuf FALSE) diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake index 205e8d26d93ca1..f9cb3a9075a821 100644 --- a/cmake/external/threadpool.cmake +++ b/cmake/external/threadpool.cmake @@ -16,7 +16,11 @@ INCLUDE(ExternalProject) SET(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool) SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool) -SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git) +if(WITH_ASCEND OR WITH_ASCEND_CL) + SET(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git) +else() + SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git) +endif() SET(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040) cache_third_party(extern_threadpool diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index b0ef575f643238..100b9153394690 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -14,11 +14,17 @@ INCLUDE(ExternalProject) +IF(WITH_ROCM) + add_definitions(-DWARPCTC_WITH_HIP) +ENDIF() + SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc) SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) +# in case of low internet speed +#set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git) set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git) -set(WARPCTC_TAG cd828e5b6c3b953b82af73f7f44cddc393a20efa) +set(WARPCTC_TAG c690fc5755abbdbdc98ef78d51ec10a6748a8cd1) SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE) @@ -37,38 +43,77 @@ cache_third_party(extern_warpctc TAG ${WARPCTC_TAG} DIR WARPCTC_SOURCE_DIR) -ExternalProject_Add( - extern_warpctc - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - "${WARPCTC_DOWNLOAD_CMD}" - PREFIX ${WARPCTC_PREFIX_DIR} - SOURCE_DIR ${WARPCTC_SOURCE_DIR} - #UPDATE_COMMAND "" - PATCH_COMMAND "" - BUILD_ALWAYS 1 - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_C_FLAGS=$ - -DCMAKE_C_FLAGS_DEBUG=$ - -DCMAKE_C_FLAGS_RELEASE=$ - -DCMAKE_CXX_FLAGS=$ - -DCMAKE_CXX_FLAGS_RELEASE=$ - -DCMAKE_CXX_FLAGS_DEBUG=$ - 
-DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} - -DWITH_GPU=${WITH_GPU} - -DWITH_OMP=${USE_OMP} - -DWITH_TORCH=OFF - -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON - -DBUILD_SHARED=ON - -DBUILD_TESTS=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} -) +if(WITH_ASCEND OR WITH_ASCEND_CL) + ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${WARPCTC_DOWNLOAD_CMD}" + PREFIX ${WARPCTC_PREFIX_DIR} + SOURCE_DIR ${WARPCTC_SOURCE_DIR} + #UPDATE_COMMAND "" + PATCH_COMMAND "" + BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + ) +else() + ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${WARPCTC_DOWNLOAD_CMD}" + PREFIX ${WARPCTC_PREFIX_DIR} + SOURCE_DIR ${WARPCTC_SOURCE_DIR} + #UPDATE_COMMAND "" + PATCH_COMMAND "" + BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=$ + -DCMAKE_C_FLAGS_DEBUG=$ + -DCMAKE_C_FLAGS_RELEASE=$ + -DCMAKE_CXX_FLAGS=$ + -DCMAKE_CXX_FLAGS_RELEASE=$ + -DCMAKE_CXX_FLAGS_DEBUG=$ + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + ) +endif() + + IF(WIN32) SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index b5a3f0154745b9..f846623602ed79 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -13,7 +13,7 @@ if(NOT XPU_SDK_ROOT) elseif(WITH_SUNWAY) SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) else() - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_02_27.tar.gz" CACHE STRING "" FORCE) + SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_04_09.tar.gz" CACHE STRING "" FORCE) endif() SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") diff --git a/cmake/flags.cmake b/cmake/flags.cmake index e110524dd1abb8..a2ddad557c2956 100644 --- 
a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -4,10 +4,10 @@ include(CheckCCompilerFlag) include(CheckCXXSymbolExists) include(CheckTypeSize) -function(CheckCompilerCXX11Flag) +function(CheckCompilerCXX14Flag) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8) - message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.") + if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.4) + message(FATAL_ERROR "Unsupported GCC version. GCC >= 5.4 required.") elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.2) message(WARNING "Found GCC ${CMAKE_CXX_COMPILER_VERSION} which is too high, recommended to use GCC 8.2") endif() @@ -20,23 +20,15 @@ function(CheckCompilerCXX11Flag) message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.") endif() else() - if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3) - message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.") + if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.4) + message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.4 required.") endif() endif() endif() endfunction() -CheckCompilerCXX11Flag() -if (WITH_GPU) - if (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") - else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - endif() -else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -endif() +CheckCompilerCXX14Flag() +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") # safe_set_flag # # Set a compile flag only if compiler is support diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 0343ff3cc292d9..7dac91e531e4cf 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -11,6 +11,7 @@ function(op_library TARGET) set(cu_cc_srcs) set(hip_cc_srcs) set(xpu_cc_srcs) + set(npu_cc_srcs) set(cudnn_cu_cc_srcs) set(miopen_cu_cc_srcs) set(cudnn_cu_srcs) @@ -20,6 +21,9 @@ function(op_library TARGET) set(mkldnn_cc_srcs) set(MKLDNN_FILE) set(op_common_deps operator op_registry math_function layer common_infer_shape_functions) + if (WITH_ASCEND_CL) + set(op_common_deps ${op_common_deps} npu_op_runner) + endif() # Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build. set(options UNITY) set(oneValueArgs "") @@ -85,6 +89,12 @@ function(op_library TARGET) list(APPEND xpu_cc_srcs ${XPU_FILE}.cc) endif() endif() + if(WITH_ASCEND_CL) + string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}") + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc) + list(APPEND npu_cc_srcs ${NPU_FILE}.cc) + endif() + endif() else() foreach(src ${op_library_SRCS}) if(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu$") @@ -107,6 +117,8 @@ function(op_library TARGET) list(APPEND cu_cc_srcs ${src}) elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$") list(APPEND xpu_cc_srcs ${src}) + elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$") + list(APPEND npu_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cc$") list(APPEND cc_srcs ${src}) else() @@ -176,7 +188,7 @@ function(op_library TARGET) # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) # Combine the cc source files. - compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs}) + compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs}) if(TARGET ${UNITY_TARGET}) # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. 
target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources}) @@ -187,7 +199,7 @@ function(op_library TARGET) # Add alias library to handle dependencies. add_library(${TARGET} ALIAS ${UNITY_TARGET}) else() - cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS} + cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) endif() endif() @@ -207,6 +219,7 @@ function(op_library TARGET) # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. # Note that it's enough to just adding one operator to pybind in a *_op.cc file. # And for detail pybind information, please see generated paddle/pybind/pybind.h. + set(ORIGINAL_TARGET ${TARGET}) file(READ ${TARGET}.cc TARGET_CONTENT) string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}") # [ \t\r\n]* is used for blank characters @@ -239,8 +252,9 @@ function(op_library TARGET) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) list(LENGTH xpu_cc_srcs xpu_cc_srcs_len) list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len) + list(LENGTH npu_cc_srcs npu_cc_srcs_len) if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND - ${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0) + ${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0 AND ${npu_cc_srcs_len} EQUAL 0) file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") set(pybind_flag 1) endif() @@ -280,6 +294,26 @@ function(op_library TARGET) if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n") endif() + + if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0) + file(READ ${ORIGINAL_TARGET}_npu.cc TARGET_NPU_CONTENT) + # It is different from the logic above, becareful + string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\(.*" multi_npu_register "${TARGET_NPU_CONTENT}") + # [ \t\r\n]* is used for blank characters + string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\([ \t\r\n]*[a-z0-9_]*," one_npu_register "${multi_npu_register}") + + if (one_npu_register STREQUAL "") + string(REPLACE "_op" "" NPU_TARGET "${TARGET}") + else () + string(REPLACE "REGISTER_OP_NPU_KERNEL(" "" NPU_TARGET "${one_npu_register}") + string(REPLACE "," "" NPU_TARGET "${NPU_TARGET}") + # [ \t\r\n]+ is used for blank characters. + # Here we use '+' instead of '*' since it is a REPLACE operation. 
+ string(REGEX REPLACE "[ \t\r\n]+" "" NPU_TARGET "${NPU_TARGET}") + endif() + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${NPU_TARGET}, NPU);\n") + endif() + # pybind USE_OP_DEVICE_KERNEL for MKLDNN if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) # Append first implemented MKLDNN activation operator @@ -330,6 +364,7 @@ function(register_operators) file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") string(REPLACE "_mkldnn" "" OPS "${OPS}") string(REPLACE "_xpu" "" OPS "${OPS}") + string(REPLACE "_npu" "" OPS "${OPS}") string(REPLACE ".cc" "" OPS "${OPS}") list(REMOVE_DUPLICATES OPS) list(LENGTH register_operators_DEPS register_operators_DEPS_len) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 6488d29afc5f7f..81fa7d0dfa98f0 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -274,10 +274,15 @@ if(WITH_BOX_PS) list(APPEND third_party_deps extern_box_ps) endif(WITH_BOX_PS) -if(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) include(external/ascend) - list(APPEND third_party_deps extern_ascend) -endif (WITH_ASCEND) + if(WITH_ASCEND) + list(APPEND third_party_deps extern_ascend) + endif() + if(WITH_ASCEND_CL) + list(APPEND third_party_deps extern_ascend_cl) + endif() +endif () if (WITH_PSCORE) include(external/snappy) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index c18332d3b87316..dcff02a662e273 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -9,4 +9,3 @@ add_subdirectory(pybind) # NOTE: please add subdirectory inference at last. add_subdirectory(inference) -add_subdirectory(train) diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 5a2d7a06201ba4..a2062d82c8130b 100644 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -14,6 +14,7 @@ endif() add_subdirectory(table) add_subdirectory(service) add_subdirectory(test) +add_subdirectory(index_dataset) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) diff --git a/paddle/fluid/distributed/fleet.cc b/paddle/fluid/distributed/fleet.cc index b638af49730dd4..9aafdd769ed4a0 100644 --- a/paddle/fluid/distributed/fleet.cc +++ b/paddle/fluid/distributed/fleet.cc @@ -146,41 +146,6 @@ void FleetWrapper::CreateClient2ClientConnection() { client2client_max_retry_); } -std::future FleetWrapper::PullSparseVarsAsync( - const Scope& scope, const uint64_t table_id, - const std::vector& var_names, std::vector* fea_keys, - std::vector>* fea_values, int fea_value_dim) { - fea_keys->clear(); - fea_keys->resize(0); - fea_keys->reserve(MAX_FEASIGN_NUM); - for (auto name : var_names) { - Variable* var = scope.FindVar(name); - if (var == nullptr) { - continue; - } - LoDTensor* tensor = var->GetMutable(); - CHECK(tensor != nullptr) << "tensor of var " << name << " is null"; - int64_t* ids = tensor->data(); - size_t len = tensor->numel(); - for (auto i = 0u; i < len; ++i) { - if (ids[i] == 0u) { - continue; - } - fea_keys->push_back(static_cast(ids[i])); - } - } - fea_values->resize(fea_keys->size() + 1); - for (auto& t : *fea_values) { - t.resize(fea_value_dim); - } - std::vector pull_result_ptr; - for (auto& t : *fea_values) { - pull_result_ptr.push_back(t.data()); - } - return pserver_ptr_->_worker_ptr->pull_sparse( - pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size()); -} - void FleetWrapper::PullSparseVarsSync( const Scope& scope, const uint64_t table_id, const std::vector& var_names, std::vector* fea_keys, @@ -224,8 +189,10 @@ void 
FleetWrapper::PullSparseVarsSync( for (auto& t : *fea_values) { pull_result_ptr.push_back(t.data()); } + bool training = true; auto status = pserver_ptr_->_worker_ptr->pull_sparse( - pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size()); + pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size(), + training); pull_sparse_status.push_back(std::move(status)); for (auto& t : pull_sparse_status) { t.wait(); @@ -238,9 +205,13 @@ void FleetWrapper::PullSparseVarsSync( } } +// is_training is true means training, false means inference, the behavior is +// different on pserver + void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim, uint64_t padding_id, platform::Place place, + bool is_training, std::vector* inputs, std::vector* outputs) { std::vector fea_keys; @@ -279,7 +250,8 @@ void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim, } auto* communicator = Communicator::GetInstance(); auto status = communicator->_worker_ptr->pull_sparse( - pull_result_ptr.data(), table_id, fea_keys.data(), fea_keys.size()); + pull_result_ptr.data(), table_id, fea_keys.data(), fea_keys.size(), + is_training); status.wait(); auto ret = status.get(); if (ret != 0) { diff --git a/paddle/fluid/distributed/fleet.h b/paddle/fluid/distributed/fleet.h index ac566606ddcb40..863440180a808d 100644 --- a/paddle/fluid/distributed/fleet.h +++ b/paddle/fluid/distributed/fleet.h @@ -84,19 +84,14 @@ class FleetWrapper { int fea_dim, const std::vector& var_emb_names); - // Pull sparse variables from server in async mode - // Param: scope, table_id, var_names, fea_keys, fea_dim - // Param: fea_values std::future - std::future PullSparseVarsAsync( - const Scope& scope, const uint64_t table_id, - const std::vector& var_names, - std::vector* fea_keys, - std::vector>* fea_values, int fea_dim); - // Pull sparse variables from server in sync mode // pull immediately to tensors + // is_training is true means training, false means inference, the behavior is + // different on pserver + void PullSparseToTensorSync(const uint64_t table_id, int fea_dim, uint64_t padding_id, platform::Place place, + bool is_training, std::vector* inputs, // NOLINT std::vector* outputs); // NOLINT diff --git a/paddle/fluid/distributed/index_dataset/CMakeLists.txt b/paddle/fluid/distributed/index_dataset/CMakeLists.txt new file mode 100644 index 00000000000000..a30488494a52bc --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/CMakeLists.txt @@ -0,0 +1,7 @@ +proto_library(index_dataset_proto SRCS index_dataset.proto) +cc_library(index_wrapper SRCS index_wrapper.cc DEPS index_dataset_proto fs) +cc_library(index_sampler SRCS index_sampler.cc DEPS index_wrapper) + +if(WITH_PYTHON) + py_proto_compile(index_dataset_py_proto SRCS index_dataset.proto) +endif() diff --git a/paddle/fluid/operators/distributed/distributed_pb.h b/paddle/fluid/distributed/index_dataset/index_dataset.proto similarity index 57% rename from paddle/fluid/operators/distributed/distributed_pb.h rename to paddle/fluid/distributed/index_dataset/index_dataset.proto index f1c662be9af67b..1b4ee313671ad5 100644 --- a/paddle/fluid/operators/distributed/distributed_pb.h +++ b/paddle/fluid/distributed/index_dataset/index_dataset.proto @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -12,19 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. -#pragma once +syntax = "proto2"; +package paddle.distributed; -#ifdef PADDLE_WITH_DISTRIBUTE +message IndexNode { + required uint64 id = 1; + required bool is_leaf = 2; + required float probability = 3; +} -#ifdef PADDLE_WITH_GRPC +message TreeMeta { + required int32 height = 1; + required int32 branch = 2; +} -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - -#else // PADDLE_WITH_GRPC - -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - -#endif // PADDLE_WITH_GRPC - -#endif // PADDLE_WITH_DISTRIBUTE +message KVItem { + required bytes key = 1; + required bytes value = 2; +} \ No newline at end of file diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.cc b/paddle/fluid/distributed/index_dataset/index_sampler.cc new file mode 100644 index 00000000000000..58f85d98fb09c6 --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_sampler.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/index_dataset/index_sampler.h" +#include "paddle/fluid/operators/math/sampler.h" + +namespace paddle { +namespace distributed { + +using Sampler = paddle::operators::math::Sampler; + +std::vector> LayerWiseSampler::sample( + const std::vector>& user_inputs, + const std::vector& target_ids, bool with_hierarchy) { + auto input_num = target_ids.size(); + auto user_feature_num = user_inputs[0].size(); + std::vector> outputs( + input_num * layer_counts_sum_, + std::vector(user_feature_num + 2)); + + auto max_layer = tree_->Height(); + std::vector sampler_vec(max_layer - start_sample_layer_); + std::vector> layer_ids(max_layer - + start_sample_layer_); + + auto layer_index = max_layer - 1; + size_t idx = 0; + while (layer_index >= start_sample_layer_) { + auto layer_codes = tree_->GetLayerCodes(layer_index); + layer_ids[idx] = tree_->GetNodes(layer_codes); + sampler_vec[idx] = new paddle::operators::math::UniformSampler( + layer_ids[idx].size() - 1, seed_); + layer_index--; + idx++; + } + + idx = 0; + for (size_t i = 0; i < input_num; i++) { + auto travel_codes = + tree_->GetTravelCodes(target_ids[i], start_sample_layer_); + auto travel_path = tree_->GetNodes(travel_codes); + for (size_t j = 0; j < travel_path.size(); j++) { + // user + if (j > 0 && with_hierarchy) { + auto ancestor_codes = + tree_->GetAncestorCodes(user_inputs[i], max_layer - j - 1); + auto hierarchical_user = tree_->GetNodes(ancestor_codes); + for (int idx_offset = 0; idx_offset <= layer_counts_[j]; idx_offset++) { + for (size_t k = 0; k < user_feature_num; k++) { + outputs[idx + idx_offset][k] = hierarchical_user[k].id(); + } + } + } else { + for (int idx_offset = 0; idx_offset <= layer_counts_[j]; idx_offset++) { + for (size_t k = 0; k < user_feature_num; k++) { + outputs[idx + 
idx_offset][k] = user_inputs[i][k]; + } + } + } + + // sampler ++ + outputs[idx][user_feature_num] = travel_path[j].id(); + outputs[idx][user_feature_num + 1] = 1.0; + idx += 1; + for (int idx_offset = 0; idx_offset < layer_counts_[j]; idx_offset++) { + int sample_res = 0; + do { + sample_res = sampler_vec[j]->Sample(); + } while (layer_ids[j][sample_res].id() == travel_path[j].id()); + outputs[idx + idx_offset][user_feature_num] = + layer_ids[j][sample_res].id(); + outputs[idx + idx_offset][user_feature_num + 1] = 0; + } + idx += layer_counts_[j]; + } + } + for (size_t i = 0; i < sampler_vec.size(); i++) { + delete sampler_vec[i]; + } + return outputs; +} + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.h b/paddle/fluid/distributed/index_dataset/index_sampler.h new file mode 100644 index 00000000000000..66882bedc9b765 --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_sampler.h @@ -0,0 +1,100 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/distributed/index_dataset/index_wrapper.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +class IndexSampler { + public: + virtual ~IndexSampler() {} + IndexSampler() {} + + template + static std::shared_ptr Init(const std::string& name) { + std::shared_ptr instance = nullptr; + instance.reset(new T(name)); + return instance; + } + + virtual void init_layerwise_conf(const std::vector& layer_sample_counts, + int start_sample_layer = 1, int seed = 0) {} + virtual void init_beamsearch_conf(const int64_t k) {} + virtual std::vector> sample( + const std::vector>& user_inputs, + const std::vector& input_targets, + bool with_hierarchy = false) = 0; +}; + +class LayerWiseSampler : public IndexSampler { + public: + virtual ~LayerWiseSampler() {} + explicit LayerWiseSampler(const std::string& name) { + tree_ = IndexWrapper::GetInstance()->get_tree_index(name); + } + + void init_layerwise_conf(const std::vector& layer_sample_counts, + int start_sample_layer, int seed) override { + seed_ = seed; + start_sample_layer_ = start_sample_layer; + + PADDLE_ENFORCE_GT( + start_sample_layer_, 0, + paddle::platform::errors::InvalidArgument( + "start sampler layer = [%d], it should greater than 0.", + start_sample_layer_)); + PADDLE_ENFORCE_LT(start_sample_layer_, tree_->Height(), + paddle::platform::errors::InvalidArgument( + "start sampler layer = [%d], it should less than " + "max_layer, which is [%d].", + start_sample_layer_, tree_->Height())); + + size_t i = 0; + layer_counts_sum_ = 0; + layer_counts_.clear(); + int cur_layer = start_sample_layer_; + while (cur_layer < tree_->Height()) { + int layer_sample_num = 1; + if (i < layer_sample_counts.size()) { + layer_sample_num = layer_sample_counts[i]; + } + layer_counts_sum_ += layer_sample_num + 1; + 
layer_counts_.push_back(layer_sample_num); + VLOG(3) << "[INFO] level " << cur_layer + << " sample_layer_counts.push_back: " << layer_sample_num; + cur_layer += 1; + i += 1; + } + reverse(layer_counts_.begin(), layer_counts_.end()); + VLOG(3) << "sample counts sum: " << layer_counts_sum_; + } + std::vector> sample( + const std::vector>& user_inputs, + const std::vector& target_ids, bool with_hierarchy) override; + + private: + std::vector layer_counts_; + int64_t layer_counts_sum_{0}; + std::shared_ptr tree_{nullptr}; + int seed_{0}; + int start_sample_layer_{1}; +}; + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/index_dataset/index_wrapper.cc b/paddle/fluid/distributed/index_dataset/index_wrapper.cc new file mode 100644 index 00000000000000..99fe4ca0c6d043 --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_wrapper.cc @@ -0,0 +1,196 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/io/fs.h" + +#include +#include +#include "paddle/fluid/distributed/index_dataset/index_wrapper.h" + +namespace paddle { +namespace distributed { + +std::shared_ptr IndexWrapper::s_instance_(nullptr); + +int TreeIndex::Load(const std::string filename) { + int err_no; + auto fp = paddle::framework::fs_open_read(filename, &err_no, ""); + PADDLE_ENFORCE_NE( + fp, nullptr, + platform::errors::InvalidArgument( + "Open file %s failed. Please check whether the file exists.", + filename)); + + int num = 0; + max_id_ = 0; + fake_node_.set_id(0); + fake_node_.set_is_leaf(false); + fake_node_.set_probability(0.0); + max_code_ = 0; + size_t ret = fread(&num, sizeof(num), 1, fp.get()); + while (ret == 1 && num > 0) { + std::string content(num, '\0'); + size_t read_num = + fread(const_cast(content.data()), 1, num, fp.get()); + PADDLE_ENFORCE_EQ( + read_num, static_cast(num), + platform::errors::InvalidArgument( + "Read from file: %s failed. Valid Format is " + "an integer representing the length of the following string, " + "and the string itself.We got an iteger[% d], " + "but the following string's length is [%d].", + filename, num, read_num)); + + KVItem item; + PADDLE_ENFORCE_EQ( + item.ParseFromString(content), true, + platform::errors::InvalidArgument("Parse from file: %s failed. 
It's " + "content can't be parsed by KVItem.", + filename)); + + if (item.key() == ".tree_meta") { + meta_.ParseFromString(item.value()); + } else { + auto code = boost::lexical_cast(item.key()); + IndexNode node; + node.ParseFromString(item.value()); + PADDLE_ENFORCE_NE(node.id(), 0, + platform::errors::InvalidArgument( + "Node'id should not be equel to zero.")); + if (node.is_leaf()) { + id_codes_map_[node.id()] = code; + } + data_[code] = node; + if (node.id() > max_id_) { + max_id_ = node.id(); + } + if (code > max_code_) { + max_code_ = code; + } + } + ret = fread(&num, sizeof(num), 1, fp.get()); + } + total_nodes_num_ = data_.size(); + max_code_ += 1; + return 0; +} + +std::vector TreeIndex::GetNodes(const std::vector& codes) { + std::vector nodes; + nodes.reserve(codes.size()); + for (size_t i = 0; i < codes.size(); i++) { + if (CheckIsValid(codes[i])) { + nodes.push_back(data_.at(codes[i])); + } else { + nodes.push_back(fake_node_); + } + } + return nodes; +} + +std::vector TreeIndex::GetLayerCodes(int level) { + uint64_t level_num = static_cast(std::pow(meta_.branch(), level)); + uint64_t level_offset = level_num - 1; + + std::vector res; + res.reserve(level_num); + for (uint64_t i = 0; i < level_num; i++) { + auto code = level_offset + i; + if (CheckIsValid(code)) { + res.push_back(code); + } + } + return res; +} + +std::vector TreeIndex::GetAncestorCodes( + const std::vector& ids, int level) { + std::vector res; + res.reserve(ids.size()); + + int cur_level; + for (size_t i = 0; i < ids.size(); i++) { + if (id_codes_map_.find(ids[i]) == id_codes_map_.end()) { + res.push_back(max_code_); + } else { + auto code = id_codes_map_.at(ids[i]); + cur_level = meta_.height() - 1; + + while (level >= 0 && cur_level > level) { + code = (code - 1) / meta_.branch(); + cur_level--; + } + res.push_back(code); + } + } + return res; +} + +std::vector TreeIndex::GetChildrenCodes(uint64_t ancestor, + int level) { + auto level_code_num = static_cast(std::pow(meta_.branch(), level)); + auto code_min = level_code_num - 1; + auto code_max = meta_.branch() * level_code_num - 1; + + std::vector parent; + parent.push_back(ancestor); + std::vector res; + size_t p_idx = 0; + while (true) { + size_t p_size = parent.size(); + for (; p_idx < p_size; p_idx++) { + for (int i = 0; i < meta_.branch(); i++) { + auto code = parent[p_idx] * meta_.branch() + i + 1; + if (data_.find(code) != data_.end()) parent.push_back(code); + } + } + if ((code_min <= parent[p_idx]) && (parent[p_idx] < code_max)) { + break; + } + } + + return std::vector(parent.begin() + p_idx, parent.end()); +} + +std::vector TreeIndex::GetTravelCodes(uint64_t id, int start_level) { + std::vector res; + PADDLE_ENFORCE_NE(id_codes_map_.find(id), id_codes_map_.end(), + paddle::platform::errors::InvalidArgument( + "id = %d doesn't exist in Tree.", id)); + auto code = id_codes_map_.at(id); + int level = meta_.height() - 1; + + while (level >= start_level) { + res.push_back(code); + code = (code - 1) / meta_.branch(); + level--; + } + return res; +} + +std::vector TreeIndex::GetAllLeafs() { + std::vector res; + res.reserve(id_codes_map_.size()); + for (auto& ite : id_codes_map_) { + auto code = ite.second; + res.push_back(data_.at(code)); + } + return res; +} + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/index_dataset/index_wrapper.h b/paddle/fluid/distributed/index_dataset/index_wrapper.h new file mode 100644 index 00000000000000..8fb8faf6c84a2d --- /dev/null +++ 
b/paddle/fluid/distributed/index_dataset/index_wrapper.h @@ -0,0 +1,120 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/distributed/index_dataset/index_dataset.pb.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +class Index { + public: + Index() {} + ~Index() {} +}; + +class TreeIndex : public Index { + public: + TreeIndex() {} + ~TreeIndex() {} + + int Height() { return meta_.height(); } + int Branch() { return meta_.branch(); } + uint64_t TotalNodeNums() { return total_nodes_num_; } + uint64_t EmbSize() { return max_id_ + 1; } + int Load(const std::string path); + + inline bool CheckIsValid(int code) { + if (data_.find(code) != data_.end()) { + return true; + } else { + return false; + } + } + + std::vector GetNodes(const std::vector& codes); + std::vector GetLayerCodes(int level); + std::vector GetAncestorCodes(const std::vector& ids, + int level); + std::vector GetChildrenCodes(uint64_t ancestor, int level); + std::vector GetTravelCodes(uint64_t id, int start_level); + std::vector GetAllLeafs(); + + std::unordered_map data_; + std::unordered_map id_codes_map_; + uint64_t total_nodes_num_; + TreeMeta meta_; + uint64_t max_id_; + uint64_t max_code_; + IndexNode fake_node_; +}; + +using TreePtr = std::shared_ptr; + +class IndexWrapper { + public: + virtual ~IndexWrapper() {} + IndexWrapper() {} + + void clear_tree() { tree_map.clear(); } + + TreePtr get_tree_index(const std::string name) { + PADDLE_ENFORCE_NE(tree_map.find(name), tree_map.end(), + paddle::platform::errors::InvalidArgument( + "tree [%s] doesn't exist. Please insert it firstly " + "by API[\' insert_tree_index \'].", + name)); + return tree_map[name]; + } + + void insert_tree_index(const std::string name, const std::string tree_path) { + if (tree_map.find(name) != tree_map.end()) { + VLOG(0) << "Tree " << name << " has already existed."; + return; + } + TreePtr tree = std::make_shared(); + int ret = tree->Load(tree_path); + PADDLE_ENFORCE_EQ(ret, 0, paddle::platform::errors::InvalidArgument( + "Load tree[%s] from path[%s] failed. 
Please " + "check whether the file exists.", + name, tree_path)); + tree_map.insert(std::pair{name, tree}); + } + + static std::shared_ptr GetInstancePtr() { + if (NULL == s_instance_) { + s_instance_.reset(new paddle::distributed::IndexWrapper()); + } + return s_instance_; + } + + static IndexWrapper* GetInstance() { + if (NULL == s_instance_) { + s_instance_.reset(new paddle::distributed::IndexWrapper()); + } + return s_instance_.get(); + } + + private: + static std::shared_ptr s_instance_; + std::unordered_map tree_map; +}; + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/service/CMakeLists.txt b/paddle/fluid/distributed/service/CMakeLists.txt index bb3f6f1174da9d..d1f04e26ade728 100644 --- a/paddle/fluid/distributed/service/CMakeLists.txt +++ b/paddle/fluid/distributed/service/CMakeLists.txt @@ -16,6 +16,7 @@ set_source_files_properties(communicator.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUT set_source_files_properties(service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_ps_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(ps_local_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_utils.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(heter_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) @@ -24,11 +25,13 @@ set_source_files_properties(heter_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUT set_source_files_properties(client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - +set_source_files_properties(graph_brpc_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(graph_brpc_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(brpc_utils SRCS brpc_utils.cc DEPS tensor device_context ${COMMON_DEPS} ${RPC_DEPS}) -cc_library(downpour_server SRCS brpc_ps_server.cc DEPS boost eigen3 table brpc_utils ${RPC_DEPS}) -cc_library(downpour_client SRCS brpc_ps_client.cc DEPS boost eigen3 table brpc_utils ${RPC_DEPS}) +cc_library(downpour_server SRCS graph_brpc_server.cc brpc_ps_server.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) +cc_library(downpour_client SRCS graph_brpc_client.cc brpc_ps_client.cc +ps_local_client.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) cc_library(client SRCS ps_client.cc DEPS downpour_client boost ${RPC_DEPS}) cc_library(server SRCS server.cc DEPS downpour_server boost ${RPC_DEPS}) @@ -38,3 +41,6 @@ cc_library(ps_service SRCS service.cc DEPS communicator client server boost ${RP cc_library(heter_server SRCS heter_server.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) cc_library(heter_client SRCS heter_client.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) + +set_source_files_properties(graph_py_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(graph_py_service SRCS graph_py_service.cc DEPS ps_service) diff --git a/paddle/fluid/distributed/service/brpc_ps_client.cc b/paddle/fluid/distributed/service/brpc_ps_client.cc index 163526fe3b28c9..a6ad9d08f52fda 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.cc +++ 
b/paddle/fluid/distributed/service/brpc_ps_client.cc @@ -768,8 +768,8 @@ std::future BrpcPsClient::push_global_step(int table_id, std::future BrpcPsClient::pull_sparse(float **select_values, size_t table_id, - const uint64_t *keys, - size_t num) { + const uint64_t *keys, size_t num, + bool is_training) { size_t request_call_num = _server_channels.size(); auto shard_sorted_kvs = std::make_shared< @@ -837,16 +837,27 @@ std::future BrpcPsClient::pull_sparse(float **select_values, uint32_t kv_request_count = 0; size_t sorted_kv_size = sorted_kvs.size(); auto &request_buffer = closure->cntl(i)->request_attachment(); + + request_buffer.append((void *)&is_training, sizeof(bool)); + std::vector keys_counter; + keys_counter.reserve(sorted_kv_size); + for (size_t kv_idx = 0; kv_idx < sorted_kv_size; ++kv_idx) { ++kv_request_count; + uint32_t keys = 1; last_key = sorted_kvs[kv_idx].first; request_buffer.append((void *)&last_key, sizeof(uint64_t)); while (kv_idx < sorted_kv_size - 1 && last_key == sorted_kvs[kv_idx + 1].first) { ++kv_idx; + ++keys; } + keys_counter.push_back(keys); } + request_buffer.append((void *)keys_counter.data(), + sizeof(uint32_t) * keys_counter.size()); + if (kv_request_count == 0) { closure->Run(); } else { @@ -869,8 +880,8 @@ std::future BrpcPsClient::send_client2client_msg( auto promise = std::make_shared>(); std::future fut = promise->get_future(); if (to_client_id >= _client_channels.size()) { - LOG(FATAL) << "to_client_id is out of range clients, which size is " - << _client_channels.size(); + VLOG(0) << "to_client_id is out of range clients, which size is " + << _client_channels.size(); promise->set_value(-1); return fut; } @@ -956,7 +967,7 @@ int32_t BrpcPsClient::recv_and_save_table(const uint64_t table_id, } auto status = pull_sparse((float **)save_vec.data(), table_id, - save_key.data(), save_key.size()); + save_key.data(), save_key.size(), true); status.wait(); // create lod tensor diff --git a/paddle/fluid/distributed/service/brpc_ps_client.h b/paddle/fluid/distributed/service/brpc_ps_client.h index 8f9d2653864d1c..5192356e4b5e57 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.h +++ b/paddle/fluid/distributed/service/brpc_ps_client.h @@ -148,7 +148,8 @@ class BrpcPsClient : public PSClient { virtual std::future pull_sparse(float **select_values, size_t table_id, - const uint64_t *keys, size_t num); + const uint64_t *keys, size_t num, + bool is_training); virtual std::future print_table_stat(uint32_t table_id); @@ -170,9 +171,22 @@ class BrpcPsClient : public PSClient { virtual int32_t recv_and_save_table(const uint64_t table_id, const std::string &path); - private: + protected: + virtual size_t get_server_nums() { return _server_channels.size(); } + inline brpc::Channel *get_sparse_channel(size_t server_id) { + return _server_channels[server_id][0].get(); + } + inline brpc::Channel *get_dense_channel(size_t server_id) { + return _server_channels[server_id][1].get(); + } + inline brpc::Channel *get_cmd_channel(size_t server_id) { + return _server_channels[server_id][2].get(); + } virtual int32_t initialize() override; + private: + // virtual int32_t initialize() override; + inline uint32_t dense_dim_per_shard(uint32_t dense_dim_total, uint32_t shard_num) { return dense_dim_total / shard_num + 1; @@ -184,16 +198,6 @@ class BrpcPsClient : public PSClient { std::future send_save_cmd(uint32_t table_id, int cmd_id, const std::vector ¶m); - inline brpc::Channel *get_sparse_channel(size_t server_id) { - return _server_channels[server_id][0].get(); - } - 
inline brpc::Channel *get_dense_channel(size_t server_id) { - return _server_channels[server_id][1].get(); - } - inline brpc::Channel *get_cmd_channel(size_t server_id) { - return _server_channels[server_id][2].get(); - } - bool _running = false; bool _flushing = false; std::atomic _async_call_num; //异步请求计数 @@ -220,8 +224,6 @@ class BrpcPsClient : public PSClient { size_t num, void *done) override; - virtual size_t get_server_nums() { return _server_channels.size(); } - private: int32_t start_client_service(); diff --git a/paddle/fluid/distributed/service/brpc_ps_server.cc b/paddle/fluid/distributed/service/brpc_ps_server.cc index 8400e669182d67..a9370561a540be 100644 --- a/paddle/fluid/distributed/service/brpc_ps_server.cc +++ b/paddle/fluid/distributed/service/brpc_ps_server.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/distributed/service/brpc_ps_server.h" #include // NOLINT +#include "paddle/fluid/distributed/table/depends/sparse_utils.h" #include "paddle/fluid/distributed/table/table.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/platform/profiler.h" @@ -60,7 +61,8 @@ uint64_t BrpcPsServer::start(const std::string &ip, uint32_t port) { std::unique_lock lock(mutex_); std::string ip_port = ip + ":" + std::to_string(port); - VLOG(3) << "server of rank " << _rank << " starts at " << ip_port; + VLOG(0) << "running server with rank id: " << _rank + << ", endpoint: " << ip_port; brpc::ServerOptions options; int num_threads = std::thread::hardware_concurrency(); @@ -336,33 +338,39 @@ int32_t BrpcPsService::pull_sparse(Table *table, brpc::Controller *cntl) { platform::RecordEvent record_event("PsService->pull_sparse"); CHECK_TABLE_EXIST(table, request, response) - thread_local std::string push_sparse_request_buffer; + auto &req_io_buffer = cntl->request_attachment(); auto req_buffer_size = req_io_buffer.size(); + if (req_buffer_size < 1) { set_response_code(response, -1, "req attachment is empty"); return 0; } + if (request.params_size() < 1) { set_response_code(response, -1, "PsRequestMessage.params is requeired at " "least 1 for num of sparse_key"); return 0; } + uint32_t num = *(uint32_t *)(request.params(0).c_str()); - push_sparse_request_buffer.resize(0); - push_sparse_request_buffer.reserve(req_buffer_size); - const char *data = (const char *)cntl->request_attachment().fetch( - const_cast(push_sparse_request_buffer.data()), req_buffer_size); - /* - Attachment Content: - |---keysData---| - |---8*{num}B---| - */ - const uint64_t *keys = (const uint64_t *)data; + auto dim = table->value_accesor()->select_dim(); + + thread_local std::string req_buffer; + req_buffer.reserve(req_buffer_size); + + const void *data = cntl->request_attachment().fetch( + const_cast(req_buffer.data()), req_buffer_size); + + auto value = PullSparseValue(num, dim); + + value.DeserializeFromBytes(const_cast(data)); + std::vector res_data; - res_data.resize(num * table->value_accesor()->select_size() / sizeof(float)); - table->pull_sparse(res_data.data(), keys, num); + res_data.resize(num * dim); + table->pull_sparse(res_data.data(), value); + cntl->response_attachment().append((char *)res_data.data(), res_data.size() * sizeof(float)); return 0; @@ -538,7 +546,7 @@ int32_t BrpcPsService::stop_server(Table *table, auto *p_server = _server; std::thread t_stop([p_server]() { p_server->stop(); - LOG(INFO) << "Server Stoped"; + VLOG(3) << "Server Stoped"; }); t_stop.detach(); return 0; diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc index 
096718768149c5..a356b77e73733e 100644 --- a/paddle/fluid/distributed/service/brpc_utils.cc +++ b/paddle/fluid/distributed/service/brpc_utils.cc @@ -324,7 +324,7 @@ std::string GetIntTypeEndpoint(const std::string& ip, const uint32_t& port) { while (hp->h_addr_list[i] != NULL) { int_ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]); - VLOG(0) << "Brpc Get host by name, host:" << ip << " -> ip: " << int_ip; + VLOG(3) << "Brpc Get host by name, host:" << ip << " -> ip: " << int_ip; break; } diff --git a/paddle/fluid/distributed/service/communicator.cc b/paddle/fluid/distributed/service/communicator.cc index 8699719e5cdcc8..3d5ab8e16d9020 100644 --- a/paddle/fluid/distributed/service/communicator.cc +++ b/paddle/fluid/distributed/service/communicator.cc @@ -320,9 +320,11 @@ void Communicator::RpcRecvSparse(const std::string &varname, int table_id, push_g_vec.push_back(tensor->data() + i * dim); } + bool training = true; + auto status = _worker_ptr->pull_sparse( (float **)push_g_vec.data(), table_id, // NOLINT - sparse_push_keys.data(), sparse_push_keys.size()); + sparse_push_keys.data(), sparse_push_keys.size(), training); status.wait(); return; } diff --git a/paddle/fluid/distributed/service/communicator.h b/paddle/fluid/distributed/service/communicator.h index 043fe9d83dfc53..fa60cab2b58779 100644 --- a/paddle/fluid/distributed/service/communicator.h +++ b/paddle/fluid/distributed/service/communicator.h @@ -310,6 +310,8 @@ class Communicator { return _worker_ptr; } + RecvCtxMap &GetRecvCtxMap() { return recv_varname_to_ctx_; } + std::shared_ptr _worker_ptr; // pointer to worker protected: diff --git a/paddle/fluid/distributed/service/env.h b/paddle/fluid/distributed/service/env.h index 901aba0ad90c49..ca395a776afd4e 100644 --- a/paddle/fluid/distributed/service/env.h +++ b/paddle/fluid/distributed/service/env.h @@ -39,7 +39,7 @@ struct PSHost { // |---ip---|---port---|--rank--| // |-32bit--|--20bit---|--12bit-| - // for pslib + uint64_t serialize_to_uint64() { uint64_t host_label = 0; host_label = inet_addr(ip.c_str()); @@ -175,14 +175,12 @@ class PSEnvironment { host.ip = ip; host.port = port; host.rank = rank; - if (sign_set.count(rank) > 0) { - LOG(WARNING) << "ps-host :" << host.ip << ":" << host.port - << ", rank:" << host.rank - << " already register, ignore register"; - } else { + + if (sign_set.count(rank) == 0) { host_list.push_back(host); sign_set.insert(rank); } + return 0; } diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc new file mode 100644 index 00000000000000..a6271cac83c9a9 --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_client.cc @@ -0,0 +1,331 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
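#include <arpa/inet.h>
#include <cstdint>
#include <string>

// A minimal sketch of packing a host label following the layout comment in
// env.h above (|---ip---|---port---|--rank--| = 32/20/12 bits). The field
// order and shift amounts here are assumptions for illustration only; they
// are not copied from PSHost::serialize_to_uint64.
inline uint64_t pack_host_label(const std::string& ip, uint32_t port,
                                uint32_t rank) {
  uint64_t label = inet_addr(ip.c_str());     // 32-bit IPv4 address
  label = (label << 20) | (port & 0xFFFFFu);  // next 20 bits: port
  label = (label << 12) | (rank & 0xFFFu);    // lowest 12 bits: rank
  return label;
}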
+ +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include +#include +#include +#include +#include +#include +#include "Eigen/Dense" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace distributed { + +void GraphPsService_Stub::service( + ::google::protobuf::RpcController *controller, + const ::paddle::distributed::PsRequestMessage *request, + ::paddle::distributed::PsResponseMessage *response, + ::google::protobuf::Closure *done) { + if (graph_service != NULL && local_channel == channel()) { + // VLOG(0)<<"use local"; + task_pool->enqueue([this, controller, request, response, done]() -> int { + this->graph_service->service(controller, request, response, done); + return 0; + }); + } else { + // VLOG(0)<<"use server"; + PsService_Stub::service(controller, request, response, done); + } +} + +int GraphBrpcClient::get_server_index_by_id(uint64_t id) { + int shard_num = get_shard_num(); + int shard_per_server = shard_num % server_size == 0 + ? shard_num / server_size + : shard_num / server_size + 1; + return id % shard_num / shard_per_server; +} + +std::future GraphBrpcClient::get_node_feat( + const uint32_t &table_id, const std::vector &node_ids, + const std::vector &feature_names, + std::vector> &res) { + std::vector request2server; + std::vector server2request(server_size, -1); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + if (server2request[server_index] == -1) { + server2request[server_index] = request2server.size(); + request2server.push_back(server_index); + } + } + size_t request_call_num = request2server.size(); + std::vector> node_id_buckets(request_call_num); + std::vector> query_idx_buckets(request_call_num); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + int request_idx = server2request[server_index]; + node_id_buckets[request_idx].push_back(node_ids[query_idx]); + query_idx_buckets[request_idx].push_back(query_idx); + } + + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + request_call_num, + [&, node_id_buckets, query_idx_buckets, request_call_num](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + int fail_num = 0; + for (int request_idx = 0; request_idx < request_call_num; + ++request_idx) { + if (closure->check_response(request_idx, + PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) { + ++fail_num; + } else { + auto &res_io_buffer = + closure->cntl(request_idx)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + std::unique_ptr buffer_wrapper(new char[bytes_size]); + char *buffer = buffer_wrapper.get(); + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + + for (size_t feat_idx = 0; feat_idx < feature_names.size(); + ++feat_idx) { + for (size_t node_idx = 0; + node_idx < query_idx_buckets.at(request_idx).size(); + ++node_idx) { + int query_idx = query_idx_buckets.at(request_idx).at(node_idx); + size_t feat_len = *(size_t *)(buffer); + buffer += sizeof(size_t); + auto feature = std::string(buffer, feat_len); + res[feat_idx][query_idx] = feature; + buffer += feat_len; + } + } + } + if (fail_num == request_call_num) { + ret = -1; + } + } + closure->set_promise_value(ret); + }); + + auto 
promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (int request_idx = 0; request_idx < request_call_num; ++request_idx) { + int server_index = request2server[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_GET_NODE_FEAT); + closure->request(request_idx)->set_table_id(table_id); + closure->request(request_idx)->set_client_id(_client_id); + size_t node_num = node_id_buckets[request_idx].size(); + + closure->request(request_idx) + ->add_params((char *)node_id_buckets[request_idx].data(), + sizeof(uint64_t) * node_num); + std::string joint_feature_name = + paddle::string::join_strings(feature_names, '\t'); + closure->request(request_idx) + ->add_params(joint_feature_name.c_str(), joint_feature_name.size()); + + PsService_Stub rpc_stub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + + return fut; +} +// char* &buffer,int &actual_size +std::future GraphBrpcClient::batch_sample_neighboors( + uint32_t table_id, std::vector node_ids, int sample_size, + std::vector>> &res) { + std::vector request2server; + std::vector server2request(server_size, -1); + res.clear(); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + if (server2request[server_index] == -1) { + server2request[server_index] = request2server.size(); + request2server.push_back(server_index); + } + res.push_back(std::vector>()); + } + size_t request_call_num = request2server.size(); + std::vector> node_id_buckets(request_call_num); + std::vector> query_idx_buckets(request_call_num); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + int request_idx = server2request[server_index]; + node_id_buckets[request_idx].push_back(node_ids[query_idx]); + query_idx_buckets[request_idx].push_back(query_idx); + } + + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + request_call_num, + [&, node_id_buckets, query_idx_buckets, request_call_num](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + int fail_num = 0; + for (int request_idx = 0; request_idx < request_call_num; + ++request_idx) { + if (closure->check_response(request_idx, + PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) { + ++fail_num; + } else { + auto &res_io_buffer = + closure->cntl(request_idx)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + std::unique_ptr buffer_wrapper(new char[bytes_size]); + char *buffer = buffer_wrapper.get(); + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + + size_t node_num = *(size_t *)buffer; + int *actual_sizes = (int *)(buffer + sizeof(size_t)); + char *node_buffer = + buffer + sizeof(size_t) + sizeof(int) * node_num; + + int offset = 0; + for (size_t node_idx = 0; node_idx < node_num; ++node_idx) { + int query_idx = query_idx_buckets.at(request_idx).at(node_idx); + int actual_size = actual_sizes[node_idx]; + int start = 0; + while (start < actual_size) { + res[query_idx].push_back( + {*(uint64_t *)(node_buffer + offset + start), + *(float *)(node_buffer + offset + start + + GraphNode::id_size)}); + start += GraphNode::id_size + GraphNode::weight_size; + } + offset += actual_size; + } + } + if (fail_num == 
request_call_num) { + ret = -1; + } + } + closure->set_promise_value(ret); + }); + + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (int request_idx = 0; request_idx < request_call_num; ++request_idx) { + int server_index = request2server[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_SAMPLE_NEIGHBOORS); + closure->request(request_idx)->set_table_id(table_id); + closure->request(request_idx)->set_client_id(_client_id); + size_t node_num = node_id_buckets[request_idx].size(); + + closure->request(request_idx) + ->add_params((char *)node_id_buckets[request_idx].data(), + sizeof(uint64_t) * node_num); + closure->request(request_idx) + ->add_params((char *)&sample_size, sizeof(int)); + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + + return fut; +} +std::future GraphBrpcClient::random_sample_nodes( + uint32_t table_id, int server_index, int sample_size, + std::vector &ids) { + DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + if (closure->check_response(0, PS_GRAPH_SAMPLE_NODES) != 0) { + ret = -1; + } else { + auto &res_io_buffer = closure->cntl(0)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + char buffer[bytes_size]; + auto size = io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + int index = 0; + while (index < bytes_size) { + ids.push_back(*(uint64_t *)(buffer + index)); + index += GraphNode::id_size; + } + } + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + ; + closure->request(0)->set_cmd_id(PS_GRAPH_SAMPLE_NODES); + closure->request(0)->set_table_id(table_id); + closure->request(0)->set_client_id(_client_id); + closure->request(0)->add_params((char *)&sample_size, sizeof(int)); + ; + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = getServiceStub(get_cmd_channel(server_index)); + closure->cntl(0)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(0), closure->request(0), closure->response(0), + closure); + return fut; +} +std::future GraphBrpcClient::pull_graph_list( + uint32_t table_id, int server_index, int start, int size, int step, + std::vector &res) { + DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + if (closure->check_response(0, PS_PULL_GRAPH_LIST) != 0) { + ret = -1; + } else { + auto &res_io_buffer = closure->cntl(0)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + char buffer[bytes_size]; + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + int index = 0; + while (index < bytes_size) { + FeatureNode node; + node.recover_from_buffer(buffer + index); + index += node.get_size(false); + res.push_back(node); + } + } + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = 
promise->get_future(); + closure->request(0)->set_cmd_id(PS_PULL_GRAPH_LIST); + closure->request(0)->set_table_id(table_id); + closure->request(0)->set_client_id(_client_id); + closure->request(0)->add_params((char *)&start, sizeof(int)); + closure->request(0)->add_params((char *)&size, sizeof(int)); + closure->request(0)->add_params((char *)&step, sizeof(int)); + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = getServiceStub(get_cmd_channel(server_index)); + closure->cntl(0)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(0), closure->request(0), closure->response(0), + closure); + return fut; +} +int32_t GraphBrpcClient::initialize() { + // set_shard_num(_config.shard_num()); + BrpcPsClient::initialize(); + server_size = get_server_nums(); + graph_service = NULL; + local_channel = NULL; + return 0; +} +} +} diff --git a/paddle/fluid/distributed/service/graph_brpc_client.h b/paddle/fluid/distributed/service/graph_brpc_client.h new file mode 100644 index 00000000000000..4e6775a4bedaf1 --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_client.h @@ -0,0 +1,105 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
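#include <cstdint>
#include <utility>
#include <vector>
#include "paddle/fluid/distributed/service/graph_brpc_client.h"

// A minimal caller-side sketch for the sampling API declared below in
// GraphBrpcClient, assuming an already configured client instance
// (construction, configure() and set_shard_num() are omitted). The table id,
// node ids and sample size are placeholder values.
void sample_neighbors_example(paddle::distributed::GraphBrpcClient& client) {
  uint32_t table_id = 0;                       // assumed edge-table id
  std::vector<uint64_t> node_ids = {1, 2, 3};  // nodes to sample from
  int sample_size = 10;                        // neighbors requested per node
  // res[i] receives (neighbor_id, edge_weight) pairs for node_ids[i].
  std::vector<std::vector<std::pair<uint64_t, float>>> res;
  auto status =
      client.batch_sample_neighboors(table_id, node_ids, sample_size, res);
  status.wait();  // the returned future resolves to 0 on success, -1 on failure
}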
+ +#pragma once + +#include +#include +#include +#include + +#include +#include "ThreadPool.h" +#include "brpc/channel.h" +#include "brpc/controller.h" +#include "brpc/server.h" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace distributed { + +class GraphPsService_Stub : public PsService_Stub { + public: + GraphPsService_Stub(::google::protobuf::RpcChannel* channel, + ::google::protobuf::RpcChannel* local_channel = NULL, + GraphBrpcService* service = NULL, int thread_num = 1) + : PsService_Stub(channel) { + this->local_channel = local_channel; + this->graph_service = service; + task_pool.reset(new ::ThreadPool(thread_num)); + } + virtual ~GraphPsService_Stub() {} + + // implements PsService ------------------------------------------ + GraphBrpcService* graph_service; + std::shared_ptr<::ThreadPool> task_pool; + ::google::protobuf::RpcChannel* local_channel; + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(GraphPsService_Stub); + void service(::google::protobuf::RpcController* controller, + const ::paddle::distributed::PsRequestMessage* request, + ::paddle::distributed::PsResponseMessage* response, + ::google::protobuf::Closure* done); +}; +class GraphBrpcClient : public BrpcPsClient { + public: + GraphBrpcClient() {} + virtual ~GraphBrpcClient() {} + // given a batch of nodes, sample graph_neighboors for each of them + virtual std::future batch_sample_neighboors( + uint32_t table_id, std::vector node_ids, int sample_size, + std::vector>>& res); + + virtual std::future pull_graph_list(uint32_t table_id, + int server_index, int start, + int size, int step, + std::vector& res); + virtual std::future random_sample_nodes(uint32_t table_id, + int server_index, + int sample_size, + std::vector& ids); + virtual std::future get_node_feat( + const uint32_t& table_id, const std::vector& node_ids, + const std::vector& feature_names, + std::vector>& res); + virtual int32_t initialize(); + int get_shard_num() { return shard_num; } + void set_shard_num(int shard_num) { this->shard_num = shard_num; } + int get_server_index_by_id(uint64_t id); + void set_local_channel(int index) { + this->local_channel = get_cmd_channel(index); + } + void set_local_graph_service(GraphBrpcService* graph_service) { + this->graph_service = graph_service; + } + GraphPsService_Stub getServiceStub(::google::protobuf::RpcChannel* channel, + int thread_num = 1) { + return GraphPsService_Stub(channel, local_channel, graph_service, + thread_num); + } + + private: + int shard_num; + size_t server_size; + ::google::protobuf::RpcChannel* local_channel; + GraphBrpcService* graph_service; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_brpc_server.cc b/paddle/fluid/distributed/service/graph_brpc_server.cc new file mode 100644 index 00000000000000..bdd926278b624b --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_server.cc @@ -0,0 +1,348 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/brpc_ps_server.h" + +#include // NOLINT +#include "butil/endpoint.h" +#include "iomanip" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/platform/profiler.h" +namespace paddle { +namespace distributed { + +int32_t GraphBrpcServer::initialize() { + auto &service_config = _config.downpour_server_param().service_param(); + if (!service_config.has_service_class()) { + LOG(ERROR) << "miss service_class in ServerServiceParameter"; + return -1; + } + auto *service = + CREATE_PSCORE_CLASS(PsBaseService, service_config.service_class()); + if (service == NULL) { + LOG(ERROR) << "service is unregistered, service_name:" + << service_config.service_class(); + return -1; + } + + _service.reset(service); + if (service->configure(this) != 0 || service->initialize() != 0) { + LOG(ERROR) << "service initialize failed, service_name:" + << service_config.service_class(); + return -1; + } + if (_server.AddService(service, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { + LOG(ERROR) << "service add to brpc failed, service:" + << service_config.service_class(); + return -1; + } + return 0; +} + +uint64_t GraphBrpcServer::start(const std::string &ip, uint32_t port) { + std::unique_lock lock(mutex_); + + std::string ip_port = ip + ":" + std::to_string(port); + VLOG(3) << "server of rank " << _rank << " starts at " << ip_port; + brpc::ServerOptions options; + + int num_threads = std::thread::hardware_concurrency(); + auto trainers = _environment->get_trainers(); + options.num_threads = trainers > num_threads ? 
trainers : num_threads; + + if (_server.Start(ip_port.c_str(), &options) != 0) { + LOG(ERROR) << "GraphBrpcServer start failed, ip_port=" << ip_port; + return 0; + } + _environment->registe_ps_server(ip, port, _rank); + return 0; +} + +int32_t GraphBrpcServer::port() { return _server.listen_address().port; } + +int32_t GraphBrpcService::initialize() { + _is_initialize_shard_info = false; + _service_handler_map[PS_STOP_SERVER] = &GraphBrpcService::stop_server; + _service_handler_map[PS_LOAD_ONE_TABLE] = &GraphBrpcService::load_one_table; + _service_handler_map[PS_LOAD_ALL_TABLE] = &GraphBrpcService::load_all_table; + + _service_handler_map[PS_PRINT_TABLE_STAT] = + &GraphBrpcService::print_table_stat; + _service_handler_map[PS_BARRIER] = &GraphBrpcService::barrier; + _service_handler_map[PS_START_PROFILER] = &GraphBrpcService::start_profiler; + _service_handler_map[PS_STOP_PROFILER] = &GraphBrpcService::stop_profiler; + + _service_handler_map[PS_PULL_GRAPH_LIST] = &GraphBrpcService::pull_graph_list; + _service_handler_map[PS_GRAPH_SAMPLE_NEIGHBOORS] = + &GraphBrpcService::graph_random_sample_neighboors; + _service_handler_map[PS_GRAPH_SAMPLE_NODES] = + &GraphBrpcService::graph_random_sample_nodes; + _service_handler_map[PS_GRAPH_GET_NODE_FEAT] = + &GraphBrpcService::graph_get_node_feat; + + // shard初始化,server启动后才可从env获取到server_list的shard信息 + initialize_shard_info(); + + return 0; +} + +#define CHECK_TABLE_EXIST(table, request, response) \ + if (table == NULL) { \ + std::string err_msg("table not found with table_id:"); \ + err_msg.append(std::to_string(request.table_id())); \ + set_response_code(response, -1, err_msg.c_str()); \ + return -1; \ + } + +int32_t GraphBrpcService::initialize_shard_info() { + if (!_is_initialize_shard_info) { + std::lock_guard guard(_initialize_shard_mutex); + if (_is_initialize_shard_info) { + return 0; + } + size_t shard_num = _server->environment()->get_ps_servers().size(); + auto &table_map = *(_server->table()); + for (auto itr : table_map) { + itr.second->set_shard(_rank, shard_num); + } + _is_initialize_shard_info = true; + } + return 0; +} + +void GraphBrpcService::service(google::protobuf::RpcController *cntl_base, + const PsRequestMessage *request, + PsResponseMessage *response, + google::protobuf::Closure *done) { + brpc::ClosureGuard done_guard(done); + std::string log_label("ReceiveCmd-"); + if (!request->has_table_id()) { + set_response_code(*response, -1, "PsRequestMessage.tabel_id is required"); + return; + } + + response->set_err_code(0); + response->set_err_msg(""); + auto *table = _server->table(request->table_id()); + brpc::Controller *cntl = static_cast(cntl_base); + auto itr = _service_handler_map.find(request->cmd_id()); + if (itr == _service_handler_map.end()) { + std::string err_msg( + "undefined cmd_id, should match PsCmdID in ps.proto, cmd_id:"); + err_msg.append(std::to_string(request->cmd_id())); + set_response_code(*response, -1, err_msg.c_str()); + return; + } + serviceFunc handler_func = itr->second; + int service_ret = (this->*handler_func)(table, *request, *response, cntl); + if (service_ret != 0) { + response->set_err_code(service_ret); + response->set_err_msg("server internal error"); + } +} + +int32_t GraphBrpcService::barrier(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + + if (request.params_size() < 1) { + set_response_code(response, -1, + "PsRequestMessage.params is requeired at " + "least 1 for num of sparse_key"); + 
return 0; + } + + auto trainer_id = request.client_id(); + auto barrier_type = request.params(0); + table->barrier(trainer_id, barrier_type); + return 0; +} + +int32_t GraphBrpcService::print_table_stat(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + std::pair ret = table->print_table_stat(); + paddle::framework::BinaryArchive ar; + ar << ret.first << ret.second; + std::string table_info(ar.Buffer(), ar.Length()); + response.set_data(table_info); + + return 0; +} + +int32_t GraphBrpcService::load_one_table(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 2) { + set_response_code( + response, -1, + "PsRequestMessage.datas is requeired at least 2 for path & load_param"); + return -1; + } + if (table->load(request.params(0), request.params(1)) != 0) { + set_response_code(response, -1, "table load failed"); + return -1; + } + return 0; +} + +int32_t GraphBrpcService::load_all_table(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + auto &table_map = *(_server->table()); + for (auto &itr : table_map) { + if (load_one_table(itr.second.get(), request, response, cntl) != 0) { + LOG(ERROR) << "load table[" << itr.first << "] failed"; + return -1; + } + } + return 0; +} + +int32_t GraphBrpcService::stop_server(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + GraphBrpcServer *p_server = (GraphBrpcServer *)_server; + std::thread t_stop([p_server]() { + p_server->stop(); + LOG(INFO) << "Server Stoped"; + }); + p_server->export_cv()->notify_all(); + t_stop.detach(); + return 0; +} + +int32_t GraphBrpcService::stop_profiler(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + platform::DisableProfiler(platform::EventSortingKey::kDefault, + string::Sprintf("server_%s_profile", _rank)); + return 0; +} + +int32_t GraphBrpcService::start_profiler(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + platform::EnableProfiler(platform::ProfilerState::kCPU); + return 0; +} + +int32_t GraphBrpcService::pull_graph_list(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 3) { + set_response_code(response, -1, + "pull_graph_list request requires at least 3 arguments"); + return 0; + } + int start = *(int *)(request.params(0).c_str()); + int size = *(int *)(request.params(1).c_str()); + int step = *(int *)(request.params(2).c_str()); + std::unique_ptr buffer; + int actual_size; + ((GraphTable *)table) + ->pull_graph_list(start, size, buffer, actual_size, false, step); + cntl->response_attachment().append(buffer.get(), actual_size); + return 0; +} +int32_t GraphBrpcService::graph_random_sample_neighboors( + Table *table, const PsRequestMessage &request, PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 2) { + set_response_code( + response, -1, + "graph_random_sample request requires at least 2 arguments"); + return 0; + } + size_t node_num = request.params(0).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t 
*)(request.params(0).c_str()); + int sample_size = *(uint64_t *)(request.params(1).c_str()); + std::vector> buffers(node_num); + std::vector actual_sizes(node_num, 0); + ((GraphTable *)table) + ->random_sample_neighboors(node_data, sample_size, buffers, actual_sizes); + + cntl->response_attachment().append(&node_num, sizeof(size_t)); + cntl->response_attachment().append(actual_sizes.data(), + sizeof(int) * node_num); + for (size_t idx = 0; idx < node_num; ++idx) { + cntl->response_attachment().append(buffers[idx].get(), actual_sizes[idx]); + } + return 0; +} +int32_t GraphBrpcService::graph_random_sample_nodes( + Table *table, const PsRequestMessage &request, PsResponseMessage &response, + brpc::Controller *cntl) { + size_t size = *(uint64_t *)(request.params(0).c_str()); + std::unique_ptr buffer; + int actual_size; + if (((GraphTable *)table)->random_sample_nodes(size, buffer, actual_size) == + 0) { + cntl->response_attachment().append(buffer.get(), actual_size); + } else + cntl->response_attachment().append(NULL, 0); + + return 0; +} + +int32_t GraphBrpcService::graph_get_node_feat(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 2) { + set_response_code( + response, -1, + "graph_get_node_feat request requires at least 2 arguments"); + return 0; + } + size_t node_num = request.params(0).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); + + std::vector feature_names = + paddle::string::split_string(request.params(1), "\t"); + + std::vector> feature( + feature_names.size(), std::vector(node_num)); + + ((GraphTable *)table)->get_node_feat(node_ids, feature_names, feature); + + for (size_t feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + for (size_t node_idx = 0; node_idx < node_num; ++node_idx) { + size_t feat_len = feature[feat_idx][node_idx].size(); + cntl->response_attachment().append(&feat_len, sizeof(size_t)); + cntl->response_attachment().append(feature[feat_idx][node_idx].data(), + feat_len); + } + } + + return 0; +} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_brpc_server.h b/paddle/fluid/distributed/service/graph_brpc_server.h new file mode 100644 index 00000000000000..32c572f9e6c2bf --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_server.h @@ -0,0 +1,114 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
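#include <cstddef>
#include <vector>

// A sketch of the response-attachment layout produced by
// graph_random_sample_neighboors above: a size_t node count, then one int
// actual_size per node, then each node's packed neighbor records (each record
// is a node id followed by a float weight, i.e. GraphNode::id_size +
// GraphNode::weight_size bytes). The struct and function names here are
// illustrative only; they are not part of the service code.
struct SampledNeighborsView {
  size_t node_num = 0;
  std::vector<int> actual_sizes;          // bytes of neighbor data per node
  std::vector<const char*> node_buffers;  // start of each node's records
};

inline SampledNeighborsView DecodeSampleResponse(const char* buffer) {
  SampledNeighborsView out;
  out.node_num = *reinterpret_cast<const size_t*>(buffer);
  const int* sizes = reinterpret_cast<const int*>(buffer + sizeof(size_t));
  out.actual_sizes.assign(sizes, sizes + out.node_num);
  const char* cursor = buffer + sizeof(size_t) + sizeof(int) * out.node_num;
  for (size_t i = 0; i < out.node_num; ++i) {
    out.node_buffers.push_back(cursor);
    cursor += out.actual_sizes[i];
  }
  return out;
}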
+ +#pragma once + +#include "brpc/channel.h" +#include "brpc/controller.h" +#include "brpc/server.h" + +#include +#include +#include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/service/server.h" +#include "paddle/fluid/distributed/table/common_graph_table.h" +#include "paddle/fluid/distributed/table/table.h" +namespace paddle { +namespace distributed { +class GraphBrpcServer : public PSServer { + public: + GraphBrpcServer() {} + virtual ~GraphBrpcServer() {} + PsBaseService *get_service() { return _service.get(); } + virtual uint64_t start(const std::string &ip, uint32_t port); + virtual int32_t stop() { + std::unique_lock lock(mutex_); + if (stoped_) return 0; + stoped_ = true; + // cv_.notify_all(); + _server.Stop(1000); + _server.Join(); + return 0; + } + virtual int32_t port(); + + std::condition_variable *export_cv() { return &cv_; } + + private: + virtual int32_t initialize(); + mutable std::mutex mutex_; + std::condition_variable cv_; + bool stoped_ = false; + brpc::Server _server; + std::shared_ptr _service; + std::vector> _pserver_channels; +}; + +class GraphBrpcService; + +typedef int32_t (GraphBrpcService::*serviceFunc)( + Table *table, const PsRequestMessage &request, PsResponseMessage &response, + brpc::Controller *cntl); + +class GraphBrpcService : public PsBaseService { + public: + virtual int32_t initialize() override; + + virtual void service(::google::protobuf::RpcController *controller, + const PsRequestMessage *request, + PsResponseMessage *response, + ::google::protobuf::Closure *done) override; + + protected: + std::unordered_map _service_handler_map; + int32_t initialize_shard_info(); + int32_t pull_graph_list(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t graph_random_sample_neighboors(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); + int32_t graph_random_sample_nodes(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); + int32_t graph_get_node_feat(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); + int32_t barrier(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t load_one_table(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t load_all_table(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t stop_server(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t start_profiler(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t stop_profiler(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + + int32_t print_table_stat(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + + private: + bool _is_initialize_shard_info; + std::mutex _initialize_shard_mutex; + std::unordered_map _msg_handler_map; + std::vector _ori_values; + const int sample_nodes_ranges = 23; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_py_service.cc b/paddle/fluid/distributed/service/graph_py_service.cc new file mode 100644 index 00000000000000..61e4e0cf7bb915 --- 
/dev/null +++ b/paddle/fluid/distributed/service/graph_py_service.cc @@ -0,0 +1,325 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/service/graph_py_service.h" +#include // NOLINT +#include "butil/endpoint.h" +#include "iomanip" +#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/platform/profiler.h" +namespace paddle { +namespace distributed { +std::vector GraphPyService::split(std::string& str, + const char pattern) { + std::vector res; + std::stringstream input(str); + std::string temp; + while (std::getline(input, temp, pattern)) { + res.push_back(temp); + } + return res; +} + +void GraphPyService::add_table_feat_conf(std::string table_name, + std::string feat_name, + std::string feat_dtype, + int32_t feat_shape) { + if (this->table_id_map.count(table_name)) { + this->table_feat_conf_table_name.push_back(table_name); + this->table_feat_conf_feat_name.push_back(feat_name); + this->table_feat_conf_feat_dtype.push_back(feat_dtype); + this->table_feat_conf_feat_shape.push_back(feat_shape); + } +} + +void GraphPyService::set_up(std::string ips_str, int shard_num, + std::vector node_types, + std::vector edge_types) { + set_shard_num(shard_num); + set_num_node_types(node_types.size()); + + for (size_t table_id = 0; table_id < node_types.size(); table_id++) { + this->table_id_map[node_types[table_id]] = this->table_id_map.size(); + } + for (size_t table_id = 0; table_id < edge_types.size(); table_id++) { + this->table_id_map[edge_types[table_id]] = this->table_id_map.size(); + } + std::istringstream stream(ips_str); + std::string ip; + server_size = 0; + std::vector ips_list = split(ips_str, ';'); + int index = 0; + for (auto ips : ips_list) { + auto ip_and_port = split(ips, ':'); + server_list.push_back(ip_and_port[0]); + port_list.push_back(ip_and_port[1]); + uint32_t port = stoul(ip_and_port[1]); + auto ph_host = paddle::distributed::PSHost(ip_and_port[0], port, index); + host_sign_list.push_back(ph_host.serialize_to_string()); + index++; + } +} +void GraphPyClient::start_client() { + std::map> dense_regions; + dense_regions.insert( + std::pair>(0, {})); + auto regions = dense_regions[0]; + ::paddle::distributed::PSParameter worker_proto = GetWorkerProto(); + paddle::distributed::PaddlePSEnvironment _ps_env; + auto servers_ = host_sign_list.size(); + _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list, servers_); + worker_ptr = std::shared_ptr( + (paddle::distributed::GraphBrpcClient*) + paddle::distributed::PSClientFactory::create(worker_proto)); + worker_ptr->configure(worker_proto, dense_regions, _ps_env, client_id); + worker_ptr->set_shard_num(get_shard_num()); +} +void GraphPyServer::start_server(bool block) { + std::string ip = server_list[rank]; + uint32_t port = std::stoul(port_list[rank]); + ::paddle::distributed::PSParameter server_proto = 
this->GetServerProto(); + + auto _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&this->host_sign_list, + this->host_sign_list.size()); // test + pserver_ptr = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto)); + VLOG(0) << "pserver-ptr created "; + std::vector empty_vec; + framework::ProgramDesc empty_prog; + empty_vec.push_back(empty_prog); + pserver_ptr->configure(server_proto, _ps_env, rank, empty_vec); + pserver_ptr->start(ip, port); + std::condition_variable* cv_ = pserver_ptr->export_cv(); + if (block) { + std::mutex mutex_; + std::unique_lock lock(mutex_); + cv_->wait(lock); + } +} +::paddle::distributed::PSParameter GraphPyServer::GetServerProto() { + // Generate server proto desc + ::paddle::distributed::PSParameter server_fleet_desc; + ::paddle::distributed::ServerParameter* server_proto = + server_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + for (auto& tuple : this->table_id_map) { + VLOG(0) << " make a new table " << tuple.second; + ::paddle::distributed::TableParameter* sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + std::vector feat_name; + std::vector feat_dtype; + std::vector feat_shape; + for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) { + if (tuple.first == table_feat_conf_table_name[i]) { + feat_name.push_back(table_feat_conf_feat_name[i]); + feat_dtype.push_back(table_feat_conf_feat_dtype[i]); + feat_shape.push_back(table_feat_conf_feat_shape[i]); + } + } + std::string table_type; + if (tuple.second < this->num_node_types) { + table_type = "node"; + } else { + table_type = "edge"; + } + + GetDownpourSparseTableProto(sparse_table_proto, tuple.second, tuple.first, + table_type, feat_name, feat_dtype, feat_shape); + } + + return server_fleet_desc; +} + +::paddle::distributed::PSParameter GraphPyClient::GetWorkerProto() { + ::paddle::distributed::PSParameter worker_fleet_desc; + ::paddle::distributed::WorkerParameter* worker_proto = + worker_fleet_desc.mutable_worker_param(); + + ::paddle::distributed::DownpourWorkerParameter* downpour_worker_proto = + worker_proto->mutable_downpour_worker_param(); + + for (auto& tuple : this->table_id_map) { + VLOG(0) << " make a new table " << tuple.second; + ::paddle::distributed::TableParameter* worker_sparse_table_proto = + downpour_worker_proto->add_downpour_table_param(); + std::vector feat_name; + std::vector feat_dtype; + std::vector feat_shape; + for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) { + if (tuple.first == table_feat_conf_table_name[i]) { + feat_name.push_back(table_feat_conf_feat_name[i]); + feat_dtype.push_back(table_feat_conf_feat_dtype[i]); + feat_shape.push_back(table_feat_conf_feat_shape[i]); + } + } + std::string table_type; + if (tuple.second < this->num_node_types) { + table_type = "node"; + } else { + table_type = "edge"; + } + + GetDownpourSparseTableProto(worker_sparse_table_proto, tuple.second, + tuple.first, table_type, 
feat_name, feat_dtype, + feat_shape); + } + + ::paddle::distributed::ServerParameter* server_proto = + worker_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + for (auto& tuple : this->table_id_map) { + VLOG(0) << " make a new table " << tuple.second; + ::paddle::distributed::TableParameter* sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + std::vector feat_name; + std::vector feat_dtype; + std::vector feat_shape; + for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) { + if (tuple.first == table_feat_conf_table_name[i]) { + feat_name.push_back(table_feat_conf_feat_name[i]); + feat_dtype.push_back(table_feat_conf_feat_dtype[i]); + feat_shape.push_back(table_feat_conf_feat_shape[i]); + } + } + std::string table_type; + if (tuple.second < this->num_node_types) { + table_type = "node"; + } else { + table_type = "edge"; + } + + GetDownpourSparseTableProto(sparse_table_proto, tuple.second, tuple.first, + table_type, feat_name, feat_dtype, feat_shape); + } + + return worker_fleet_desc; +} +void GraphPyClient::load_edge_file(std::string name, std::string filepath, + bool reverse) { + // 'e' means load edge + std::string params = "e"; + if (reverse) { + // 'e<' means load edges from $2 to $1 + params += "<"; + } else { + // 'e>' means load edges from $1 to $2 + params += ">"; + } + if (this->table_id_map.count(name)) { + VLOG(0) << "loadding data with type " << name << " from " << filepath; + uint32_t table_id = this->table_id_map[name]; + auto status = + get_ps_client()->load(table_id, std::string(filepath), params); + status.wait(); + } +} + +void GraphPyClient::load_node_file(std::string name, std::string filepath) { + // 'n' means load nodes and 'node_type' follows + std::string params = "n" + name; + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = + get_ps_client()->load(table_id, std::string(filepath), params); + status.wait(); + } +} +std::vector>> +GraphPyClient::batch_sample_neighboors(std::string name, + std::vector node_ids, + int sample_size) { + std::vector>> v; + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = + worker_ptr->batch_sample_neighboors(table_id, node_ids, sample_size, v); + status.wait(); + } + return v; +} + +std::vector GraphPyClient::random_sample_nodes(std::string name, + int server_index, + int sample_size) { + std::vector v; + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = + worker_ptr->random_sample_nodes(table_id, server_index, sample_size, v); + status.wait(); + } + return v; +} + +// (name, dtype, ndarray) +std::vector> GraphPyClient::get_node_feat( + std::string node_type, std::vector node_ids, + std::vector feature_names) { + std::vector> v( + feature_names.size(), std::vector(node_ids.size())); + if (this->table_id_map.count(node_type)) { + uint32_t table_id = this->table_id_map[node_type]; + auto status = + worker_ptr->get_node_feat(table_id, 
node_ids, feature_names, v); + status.wait(); + } + return v; +} + +std::vector GraphPyClient::pull_graph_list(std::string name, + int server_index, + int start, int size, + int step) { + std::vector res; + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = worker_ptr->pull_graph_list(table_id, server_index, start, + size, step, res); + status.wait(); + } + return res; +} + +void GraphPyClient::stop_server() { + VLOG(0) << "going to stop server"; + std::unique_lock lock(mutex_); + if (stoped_) return; + auto status = this->worker_ptr->stop_server(); + if (status.get() == 0) stoped_ = true; +} +void GraphPyClient::finalize_worker() { this->worker_ptr->finalize_worker(); } +} +} diff --git a/paddle/fluid/distributed/service/graph_py_service.h b/paddle/fluid/distributed/service/graph_py_service.h new file mode 100644 index 00000000000000..e185f23e3d240f --- /dev/null +++ b/paddle/fluid/distributed/service/graph_py_service.h @@ -0,0 +1,178 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include // NOLINT +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include +#include "google/protobuf/text_format.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" + +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/service/env.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/service/service.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" +namespace paddle { +namespace distributed { +class GraphPyService { + protected: + std::vector server_list, port_list, host_sign_list; + int server_size, shard_num; + int num_node_types; + std::unordered_map table_id_map; + std::vector table_feat_conf_table_name; + std::vector table_feat_conf_feat_name; + std::vector table_feat_conf_feat_dtype; + std::vector table_feat_conf_feat_shape; + + // std::thread *server_thread, *client_thread; + + // std::shared_ptr pserver_ptr; + + // std::shared_ptr worker_ptr; + + public: + // std::shared_ptr get_ps_server() { + // return pserver_ptr; + // } + // std::shared_ptr get_ps_client() { + // return worker_ptr; + // } + int get_shard_num() { return shard_num; } + void set_shard_num(int shard_num) { this->shard_num = shard_num; } + void GetDownpourSparseTableProto( + ::paddle::distributed::TableParameter* sparse_table_proto, + uint32_t table_id, std::string table_name, std::string table_type, + std::vector feat_name, std::vector feat_dtype, + 
std::vector feat_shape) { + sparse_table_proto->set_table_id(table_id); + sparse_table_proto->set_table_class("GraphTable"); + sparse_table_proto->set_shard_num(shard_num); + sparse_table_proto->set_type(::paddle::distributed::PS_SPARSE_TABLE); + ::paddle::distributed::TableAccessorParameter* accessor_proto = + sparse_table_proto->mutable_accessor(); + + ::paddle::distributed::CommonAccessorParameter* common_proto = + sparse_table_proto->mutable_common(); + + // Set GraphTable Parameter + common_proto->set_table_name(table_name); + common_proto->set_name(table_type); + for (size_t i = 0; i < feat_name.size(); i++) { + common_proto->add_params(feat_dtype[i]); + common_proto->add_dims(feat_shape[i]); + common_proto->add_attributes(feat_name[i]); + } + + accessor_proto->set_accessor_class("CommMergeAccessor"); + } + + void set_server_size(int server_size) { this->server_size = server_size; } + void set_num_node_types(int num_node_types) { + this->num_node_types = num_node_types; + } + int get_server_size(int server_size) { return server_size; } + std::vector split(std::string& str, const char pattern); + void set_up(std::string ips_str, int shard_num, + std::vector node_types, + std::vector edge_types); + + void add_table_feat_conf(std::string node_type, std::string feat_name, + std::string feat_dtype, int32_t feat_shape); +}; +class GraphPyServer : public GraphPyService { + public: + GraphPyServer() {} + void set_up(std::string ips_str, int shard_num, + std::vector node_types, + std::vector edge_types, int rank) { + set_rank(rank); + GraphPyService::set_up(ips_str, shard_num, node_types, edge_types); + } + int get_rank() { return rank; } + void set_rank(int rank) { this->rank = rank; } + + void start_server(bool block = true); + ::paddle::distributed::PSParameter GetServerProto(); + std::shared_ptr get_ps_server() { + return pserver_ptr; + } + + protected: + int rank; + std::shared_ptr pserver_ptr; + std::thread* server_thread; +}; +class GraphPyClient : public GraphPyService { + public: + void set_up(std::string ips_str, int shard_num, + std::vector node_types, + std::vector edge_types, int client_id) { + set_client_id(client_id); + GraphPyService::set_up(ips_str, shard_num, node_types, edge_types); + } + std::shared_ptr get_ps_client() { + return worker_ptr; + } + void bind_local_server(int local_channel_index, GraphPyServer& server) { + worker_ptr->set_local_channel(local_channel_index); + worker_ptr->set_local_graph_service( + (paddle::distributed::GraphBrpcService*)server.get_ps_server() + ->get_service()); + } + void stop_server(); + void finalize_worker(); + void load_edge_file(std::string name, std::string filepath, bool reverse); + void load_node_file(std::string name, std::string filepath); + int get_client_id() { return client_id; } + void set_client_id(int client_id) { this->client_id = client_id; } + void start_client(); + std::vector>> batch_sample_neighboors( + std::string name, std::vector node_ids, int sample_size); + std::vector random_sample_nodes(std::string name, int server_index, + int sample_size); + std::vector> get_node_feat( + std::string node_type, std::vector node_ids, + std::vector feature_names); + std::vector pull_graph_list(std::string name, int server_index, + int start, int size, int step = 1); + ::paddle::distributed::PSParameter GetWorkerProto(); + + protected: + mutable std::mutex mutex_; + int client_id; + std::shared_ptr worker_ptr; + std::thread* client_thread; + bool stoped_ = false; +}; +} +} diff --git a/paddle/fluid/distributed/service/ps_client.cc 
b/paddle/fluid/distributed/service/ps_client.cc index 095b5dee0b28e4..d45f41a0f58de3 100644 --- a/paddle/fluid/distributed/service/ps_client.cc +++ b/paddle/fluid/distributed/service/ps_client.cc @@ -15,11 +15,15 @@ #include "paddle/fluid/distributed/service/ps_client.h" #include "glog/logging.h" #include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/ps_local_client.h" #include "paddle/fluid/distributed/table/table.h" namespace paddle { namespace distributed { REGISTER_PSCORE_CLASS(PSClient, BrpcPsClient); +REGISTER_PSCORE_CLASS(PSClient, PsLocalClient); +REGISTER_PSCORE_CLASS(PSClient, GraphBrpcClient); int32_t PSClient::configure( const PSParameter &config, @@ -78,8 +82,7 @@ PSClient *PSClientFactory::create(const PSParameter &ps_config) { } TableManager::instance().initialize(); - LOG(INFO) << "Create PSClient[" << service_param.client_class() - << "] success"; + VLOG(3) << "Create PSClient[" << service_param.client_class() << "] success"; return client; } } // namespace distributed diff --git a/paddle/fluid/distributed/service/ps_client.h b/paddle/fluid/distributed/service/ps_client.h index 50f5802c63a253..74a1e0dde71fc4 100644 --- a/paddle/fluid/distributed/service/ps_client.h +++ b/paddle/fluid/distributed/service/ps_client.h @@ -24,16 +24,11 @@ #include "paddle/fluid/distributed/service/env.h" #include "paddle/fluid/distributed/service/sendrecv.pb.h" #include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" namespace paddle { namespace distributed { -class PSEnvironment; -class PsRequestMessage; -class PsResponseMessage; -class ValueAccessor; -struct Region; - using paddle::distributed::PsRequestMessage; using paddle::distributed::PsResponseMessage; @@ -117,10 +112,22 @@ class PSClient { // The keys/values buffers must not be reused before the returned future completes. // Keys requested by multiple threads are merged, then gathered and scattered to the servers. // After the results come back, traverse the buffer and assign the values. + // is_training distinguishes training requests from inference requests; the server handles features and admission differently for each. virtual std::future<int32_t> pull_sparse(float **select_values, size_t table_id, - const uint64_t *keys, - size_t num) = 0; + const uint64_t *keys, size_t num, + bool is_training) = 0; + + virtual ::std::future<int32_t> pull_sparse_ptr(char **select_values, + size_t table_id, + const uint64_t *keys, + size_t num) { + VLOG(0) << "Did not implement"; + std::promise<int32_t> promise; + std::future<int32_t> fut = promise.get_future(); + promise.set_value(-1); + return fut; + } virtual std::future<int32_t> print_table_stat(uint32_t table_id) = 0; @@ -154,12 +161,13 @@ class PSClient { virtual std::future<int32_t> send_client2client_msg(int msg_type, int to_client_id, const std::string &msg) { - LOG(FATAL) << "Did not implement"; + VLOG(0) << "Did not implement"; std::promise<int32_t> promise; std::future<int32_t> fut = promise.get_future(); promise.set_value(-1); return fut; } + // client2client message handler, std::function ret (msg_type, from_client_id, msg) typedef std::function<int32_t(int, int, const std::string &)> MsgHandlerFunc; diff --git a/paddle/fluid/distributed/service/ps_local_client.cc b/paddle/fluid/distributed/service/ps_local_client.cc new file mode 100644 index 00000000000000..2acc845a50890b --- /dev/null +++ b/paddle/fluid/distributed/service/ps_local_client.cc @@ -0,0 +1,269 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/service/ps_local_client.h" +#include "paddle/fluid/distributed/table/table.h" + +//#define pslib_debug_dense_compress + +namespace paddle { +namespace distributed { +int32_t PsLocalClient::initialize() { + const auto& downpour_param = _config.server_param().downpour_server_param(); + TableManager::instance().initialize(); + for (size_t i = 0; i < downpour_param.downpour_table_param_size(); ++i) { + auto* table = CREATE_PSCORE_CLASS( + Table, downpour_param.downpour_table_param(i).table_class()); + table->initialize(downpour_param.downpour_table_param(i), + _config.fs_client_param()); + table->set_shard(0, 1); + _table_map[downpour_param.downpour_table_param(i).table_id()].reset(table); + } + return 0; +} + +::std::future PsLocalClient::shrink(uint32_t table_id, + const std::string threshold) { + // TODO + return done(); +} + +::std::future PsLocalClient::load(const std::string& epoch, + const std::string& mode) { + // TODO + // for (auto& it : _table_map) { + // load(it.first, epoch, mode); + //} + return done(); +} +::std::future PsLocalClient::load(uint32_t table_id, + const std::string& epoch, + const std::string& mode) { + // TODO + // auto* table_ptr = table(table_id); + // table_ptr->load(epoch, mode); + return done(); +} + +::std::future PsLocalClient::save(const std::string& epoch, + const std::string& mode) { + // TODO + for (auto& it : _table_map) { + save(it.first, epoch, mode); + } + return done(); +} +::std::future PsLocalClient::save(uint32_t table_id, + const std::string& epoch, + const std::string& mode) { + // TODO + auto* table_ptr = table(table_id); + table_ptr->flush(); + table_ptr->save(epoch, mode); + return done(); +} + +::std::future PsLocalClient::clear() { + // TODO + return done(); +} +::std::future PsLocalClient::clear(uint32_t table_id) { + // TODO + return done(); +} + +::std::future PsLocalClient::flush() { + // no need + return done(); +} + +::std::future PsLocalClient::stop_server() { + // no need + return done(); +} + +::std::future PsLocalClient::pull_dense(Region* regions, + size_t region_num, + size_t table_id) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + uint32_t num_per_shard = dense_dim_per_shard(accessor->fea_dim(), 1); + std::vector region_buffer; + region_buffer.resize(num_per_shard); + table_ptr->pull_dense(region_buffer.data(), region_buffer.size()); + + size_t region_idx = 0; + size_t region_data_idx = 0; + size_t shard_data_size = num_per_shard; + size_t shard_buffer_remain = shard_data_size * sizeof(float); + PADDLE_ENFORCE_EQ( + shard_buffer_remain, region_buffer.size() * sizeof(float), + platform::errors::PreconditionNotMet("pull dense size error.")); + size_t index = 0; + while (shard_buffer_remain > 0 && region_idx < region_num) { + auto& region = regions[region_idx]; + if (region.size - region_data_idx >= shard_buffer_remain) { + memcpy((void*)(region.data + region_data_idx), + (uint8_t*)(void*)(region_buffer.data()) + index, + shard_buffer_remain); + region_data_idx += shard_buffer_remain; + shard_buffer_remain = 0; + } else if (region.size 
- region_data_idx == 0) { + ++region_idx; + region_data_idx = 0; + } else { + memcpy((void*)(region.data + region_data_idx), + (uint8_t*)(void*)(region_buffer.data()) + index, + region.size - region_data_idx); + shard_buffer_remain -= (region.size - region_data_idx); + index += (region.size - region_data_idx); + ++region_idx; + region_data_idx = 0; + } + } + + return done(); +} + +::std::future PsLocalClient::push_dense_param(const Region* regions, + size_t region_num, + size_t table_id) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + std::vector region_buffer; + region_buffer.resize(dense_dim_per_shard(accessor->fea_dim(), 1), 0); + for (size_t i = 0, offset = 0; i < region_num; ++i) { + uint32_t data_num = regions[i].size / sizeof(float); + memcpy(region_buffer.data() + offset, regions[i].data, regions[i].size); + offset += data_num; + } + + // table_ptr->push_dense_param(region_buffer.data(), region_buffer.size()); + + return done(); +} + +::std::future PsLocalClient::push_dense_raw_gradient( + int table_id, float* total_send_data, size_t total_send_data_size, + void* callback) { + VLOG(1) << "wxx push_dense_raw_gradient"; + + PSClientClosure* closure = reinterpret_cast(callback); + + auto* table_ptr = table(table_id); + + table_ptr->push_dense(total_send_data, total_send_data_size); + delete closure; + return done(); +} + +::std::future PsLocalClient::push_dense(const Region* regions, + size_t region_num, + size_t table_id) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + std::vector region_buffer; + region_buffer.resize(dense_dim_per_shard(accessor->fea_dim(), 1)); + size_t data_size = region_buffer.size(); + for (size_t i = 0, offset = 0; i < region_num; ++i) { + uint32_t data_num = regions[i].size / sizeof(float); + PADDLE_ENFORCE_LE( + offset + data_num, data_size, + platform::errors::PreconditionNotMet( + "invalid dense size, cur pos[%d] data_num[%d] size[%d]", offset, + data_num, data_size)); + memcpy(region_buffer.data() + offset, regions[i].data, regions[i].size); + offset += data_num; + } + + table_ptr->push_dense(region_buffer.data(), region_buffer.size()); + + return done(); +} + +//::std::future PsLocalClient::pull_sparse(float** select_values, +// size_t table_id, +// const uint64_t* keys, +// size_t num) { +// // FIXME +// // auto timer = +// // std::make_shared("pslib_downpour_client_pull_sparse"); +// // auto local_timer = +// // std::make_shared("pslib_downpour_client_pull_sparse_local"); +// //将key拆分到各shard请求,并记录原始对应value指针 +// auto* accessor = table_accessor(table_id); +// auto* table_ptr = table(table_id); +// size_t value_size = accessor->select_size(); +// +// // table_ptr->pull_sparse(keys, num); +// std::vector res_data; +// res_data.resize(num * value_size / sizeof(float)); +// table_ptr->pull_sparse(res_data.data(), keys, num); +// // memcpy(select_values[0], res_data->data(), res_data->size() * +// // sizeof(float)); +// size_t offset = 0; +// for (int i = 0; i < num; ++i) { +// memcpy(select_values[i], (char*)res_data.data() + offset, value_size); +// offset += value_size; +// } +// +// // return fut; +// return done(); +//} + +::std::future PsLocalClient::pull_sparse_ptr(char** select_values, + size_t table_id, + const uint64_t* keys, + size_t num) { + // FIXME + // auto timer = + // std::make_shared("pslib_downpour_client_pull_sparse"); + // auto local_timer = + // std::make_shared("pslib_downpour_client_pull_sparse_local"); + //将key拆分到各shard请求,并记录原始对应value指针 + auto* table_ptr = 
table(table_id); + + table_ptr->pull_sparse_ptr(select_values, keys, num); + + return done(); +} + +::std::future PsLocalClient::push_sparse_raw_gradient( + size_t table_id, const uint64_t* keys, const float** update_values, + size_t num, void* callback) { + VLOG(1) << "wxx push_sparse_raw_gradient"; + PSClientClosure* closure = reinterpret_cast(callback); + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + table_ptr->push_sparse(keys, update_values, num); + delete closure; + return done(); +} + +::std::future PsLocalClient::push_sparse(size_t table_id, + const uint64_t* keys, + const float** update_values, + size_t num) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + table_ptr->push_sparse(keys, update_values, num); + return done(); +} +} +} diff --git a/paddle/fluid/distributed/service/ps_local_client.h b/paddle/fluid/distributed/service/ps_local_client.h new file mode 100644 index 00000000000000..9d2b01a45fe929 --- /dev/null +++ b/paddle/fluid/distributed/service/ps_local_client.h @@ -0,0 +1,226 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License 0// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "brpc/channel.h" +#include "brpc/controller.h" +#include "brpc/server.h" +#include "paddle/fluid/distributed/service/ps_client.h" + +namespace paddle { +namespace distributed { + +class Table; + +class PsLocalClient : public PSClient { + public: + PsLocalClient() {} + virtual ~PsLocalClient() { _running = false; } + virtual int32_t create_client2client_connection(int pslib_timeout_ms, + int pslib_connect_timeout_ms, + int max_retry) { + return 0; + } + + virtual ::std::future shrink(uint32_t table_id, + const std::string threshold) override; + virtual ::std::future load(const std::string& epoch, + const std::string& mode) override; + virtual ::std::future load(uint32_t table_id, + const std::string& epoch, + const std::string& mode) override; + + virtual ::std::future save(const std::string& epoch, + const std::string& mode) override; + virtual ::std::future save(uint32_t table_id, + const std::string& epoch, + const std::string& mode) override; + + virtual ::std::future clear() override; + virtual ::std::future clear(uint32_t table_id) override; + + virtual ::std::future stop_server() override; + + virtual void finalize_worker() override {} + virtual ::std::future pull_dense(Region* regions, size_t region_num, + size_t table_id); + + virtual ::std::future push_dense(const Region* regions, + size_t region_num, size_t table_id); + + virtual ::std::future push_dense_param(const Region* regions, + size_t region_num, + size_t table_id); + + virtual ::std::future pull_sparse(float** select_values, + size_t table_id, + const uint64_t* keys, size_t num, + bool is_training) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual ::std::future pull_sparse_ptr(char** select_values, + size_t table_id, + const uint64_t* keys, + 
size_t num); + + virtual ::std::future print_table_stat(uint32_t table_id) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + virtual ::std::future push_sparse(size_t table_id, + const uint64_t* keys, + const float** update_values, + size_t num); + + virtual ::std::future flush(); + // server profilera + virtual std::future start_profiler() { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + }; + + virtual std::future stop_profiler() { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future barrier(size_t table_id, uint32_t barrier_type) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future pull_geo_param(size_t table_id, + std::vector* values, + std::vector* keys, + int pserver_idx) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future push_global_step(int table_id, + int64_t* total_send_data, + void* done) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + // recv table from server and save it in LodTensor + virtual int32_t recv_and_save_table(const uint64_t table_id, + const std::string& path) { + return 0; + } + + virtual ::std::future send_client2client_msg( + int msg_type, int to_client_id, const std::string& msg) override { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + virtual size_t get_server_nums() { return 1; } + + virtual std::future push_dense_raw_gradient( + int table_id, float* total_send_data, size_t total_send_data_size, + void* callback) override; + + virtual std::future push_sparse_raw_gradient( + size_t table_id, const uint64_t* keys, const float** update_values, + size_t num, void* callback) override; + + virtual std::future push_sparse_raw_gradient_partial( + size_t table_id, const uint64_t* keys, const float** update_values, + uint32_t num, void* done, int pserver_idx) override { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future push_sparse_param(size_t table_id, + const uint64_t* keys, + const float** update_values, + size_t num, + void* done) override { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + private: + virtual int32_t initialize() override; + + std::future done() { + std::shared_ptr> prom = + std::make_shared>(); + std::future fut = prom->get_future(); + prom->set_value(0); + return fut; + } + + inline uint32_t dense_dim_per_shard(uint32_t dense_dim_total, + uint32_t shard_num) { + return dense_dim_total / shard_num + 1; + } + + inline std::unordered_map>* table() { + return &_table_map; + } + + inline Table* table(size_t table_id) { + auto itr = _table_map.find(table_id); + if (itr != _table_map.end()) { + return itr->second.get(); + } + LOG(ERROR) << "table not found " << table_id; + return NULL; + } + + std::unordered_map> _table_map; + + bool _running = false; + bool _flushing = false; + + private: + float _mae = 0; + float _mse = 0; + uint16_t _push_times = 0; +}; +} +} diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/distributed/service/ps_local_server.h similarity index 56% rename from paddle/fluid/operators/distributed/parameter_send.h rename to 
paddle/fluid/distributed/service/ps_local_server.h index 4335ef8c73cc0a..dfbccc70900e3c 100644 --- a/paddle/fluid/operators/distributed/parameter_send.h +++ b/paddle/fluid/distributed/service/ps_local_server.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,22 +14,24 @@ #pragma once -#include +#include #include - -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" +#include "paddle/fluid/distributed/service/server.h" namespace paddle { -namespace operators { namespace distributed { -template -struct ParameterSend { - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope, - bool sync, int multi_parts); -}; +class PsLocalServer : public PSServer { + public: + PsLocalServer() {} + virtual ~PsLocalServer() {} + virtual uint64_t start() { return 0; } + virtual uint64_t start(const std::string& ip, uint32_t port) { return 0; } + virtual int32_t stop() { return 0; } + virtual int32_t port() { return 0; } -}; // namespace distributed -}; // namespace operators -}; // namespace paddle + private: + virtual int32_t initialize() { return 0; } +}; +} +} diff --git a/paddle/fluid/distributed/service/sendrecv.proto b/paddle/fluid/distributed/service/sendrecv.proto index 6250f84c98754d..d908c26da9870a 100644 --- a/paddle/fluid/distributed/service/sendrecv.proto +++ b/paddle/fluid/distributed/service/sendrecv.proto @@ -48,6 +48,10 @@ enum PsCmdID { PS_START_PROFILER = 27; PS_STOP_PROFILER = 28; PS_PUSH_GLOBAL_STEP = 29; + PS_PULL_GRAPH_LIST = 30; + PS_GRAPH_SAMPLE_NEIGHBOORS = 31; + PS_GRAPH_SAMPLE_NODES = 32; + PS_GRAPH_GET_NODE_FEAT = 33; } message PsRequestMessage { @@ -111,4 +115,4 @@ message MultiVariableMessage { service PsService { rpc service(PsRequestMessage) returns (PsResponseMessage); rpc SendAndRecvVariable(MultiVariableMessage) returns (MultiVariableMessage); -}; \ No newline at end of file +}; diff --git a/paddle/fluid/distributed/service/server.cc b/paddle/fluid/distributed/service/server.cc index fc230a0b9c92e6..e44876e3d2b789 100644 --- a/paddle/fluid/distributed/service/server.cc +++ b/paddle/fluid/distributed/service/server.cc @@ -16,13 +16,18 @@ #include "glog/logging.h" #include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/ps_local_server.h" #include "paddle/fluid/distributed/table/table.h" namespace paddle { namespace distributed { REGISTER_PSCORE_CLASS(PSServer, BrpcPsServer); +REGISTER_PSCORE_CLASS(PSServer, PsLocalServer); REGISTER_PSCORE_CLASS(PsBaseService, BrpcPsService); +REGISTER_PSCORE_CLASS(PSServer, GraphBrpcServer); +REGISTER_PSCORE_CLASS(PsBaseService, GraphBrpcService); PSServer *PSServerFactory::create(const PSParameter &ps_config) { const auto &config = ps_config.server_param(); diff --git a/paddle/fluid/distributed/service/service.cc b/paddle/fluid/distributed/service/service.cc index 3d0f94fac27750..2759e4614e66e1 100644 --- a/paddle/fluid/distributed/service/service.cc +++ b/paddle/fluid/distributed/service/service.cc @@ -47,7 +47,7 @@ paddle::distributed::PSParameter load_from_prototxt( } void PSCore::init_gflag(const std::string& gflags) { - LOG(INFO) << "Init With Gflags:" << gflags; + VLOG(3) << "Init With Gflags:" << gflags; 
std::vector flags = paddle::string::split_string(gflags); if (flags.size() < 1) { flags.push_back("-max_body_size=314217728"); diff --git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/table/CMakeLists.txt index 1e98e193d54ae6..dde1f5ae8ee3a1 100644 --- a/paddle/fluid/distributed/table/CMakeLists.txt +++ b/paddle/fluid/distributed/table/CMakeLists.txt @@ -1,13 +1,19 @@ set_property(GLOBAL PROPERTY TABLE_DEPS string_helper) - +set(graphDir graph) get_property(TABLE_DEPS GLOBAL PROPERTY TABLE_DEPS) - +set_source_files_properties(${graphDir}/graph_edge.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(graph_edge SRCS ${graphDir}/graph_edge.cc) +set_source_files_properties(${graphDir}/graph_weighted_sampler.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(WeightedSampler SRCS ${graphDir}/graph_weighted_sampler.cc DEPS graph_edge) +set_source_files_properties(${graphDir}/graph_node.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(graph_node SRCS ${graphDir}/graph_node.cc DEPS WeightedSampler) set_source_files_properties(common_dense_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(common_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(barrier_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(common_graph_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc DEPS ${TABLE_DEPS} device_context string_helper simple_threadpool xxhash generator) +cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc common_graph_table.cc DEPS ${TABLE_DEPS} graph_edge graph_node device_context string_helper simple_threadpool xxhash generator) set_source_files_properties(tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(tensor_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc new file mode 100644 index 00000000000000..020bcdcc52ef4b --- /dev/null +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -0,0 +1,506 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/distributed/table/common_graph_table.h" +#include +#include +#include +#include +#include "paddle/fluid/distributed/common/utils.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace distributed { + +std::vector GraphShard::get_batch(int start, int end, int step) { + if (start < 0) start = 0; + std::vector res; + for (int pos = start; pos < std::min(end, (int)bucket.size()); pos += step) { + res.push_back(bucket[pos]); + } + return res; +} + +size_t GraphShard::get_size() { return bucket.size(); } + +GraphNode *GraphShard::add_graph_node(uint64_t id) { + if (node_location.find(id) == node_location.end()) { + node_location[id] = bucket.size(); + bucket.push_back(new GraphNode(id)); + } + return (GraphNode *)bucket[node_location[id]]; +} + +FeatureNode *GraphShard::add_feature_node(uint64_t id) { + if (node_location.find(id) == node_location.end()) { + node_location[id] = bucket.size(); + bucket.push_back(new FeatureNode(id)); + } + return (FeatureNode *)bucket[node_location[id]]; +} + +void GraphShard::add_neighboor(uint64_t id, uint64_t dst_id, float weight) { + find_node(id)->add_edge(dst_id, weight); +} + +Node *GraphShard::find_node(uint64_t id) { + auto iter = node_location.find(id); + return iter == node_location.end() ? nullptr : bucket[iter->second]; +} + +int32_t GraphTable::load(const std::string &path, const std::string ¶m) { + bool load_edge = (param[0] == 'e'); + bool load_node = (param[0] == 'n'); + if (load_edge) { + bool reverse_edge = (param[1] == '<'); + return this->load_edges(path, reverse_edge); + } + if (load_node) { + std::string node_type = param.substr(1); + return this->load_nodes(path, node_type); + } + return 0; +} + +int32_t GraphTable::get_nodes_ids_by_ranges( + std::vector> ranges, std::vector &res) { + int start = 0, end, index = 0, total_size = 0; + res.clear(); + std::vector>> tasks; + // std::string temp = ""; + // for(int i = 0;i < shards.size();i++) + // temp+= std::to_string((int)shards[i].get_size()) + " "; + // VLOG(0)<<"range distribution "<= end) { + break; + } else { + int first = std::max(ranges[index].first, start); + int second = std::min(ranges[index].second, end); + start = second; + first -= total_size; + second -= total_size; + // VLOG(0)<<" FIND RANGE "<enqueue( + [this, first, second, i]() -> std::vector { + return shards[i].get_ids_by_range(first, second); + })); + } + } + total_size += shards[i].get_size(); + } + for (int i = 0; i < tasks.size(); i++) { + auto vec = tasks[i].get(); + for (auto &id : vec) { + res.push_back(id); + std::swap(res[rand() % res.size()], res[(int)res.size() - 1]); + } + } + return 0; +} + +int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { + auto paths = paddle::string::split_string(path, ";"); + int64_t count = 0; + int64_t valid_count = 0; + for (auto path : paths) { + std::ifstream file(path); + std::string line; + while (std::getline(file, line)) { + count++; + auto values = paddle::string::split_string(line, "\t"); + if (values.size() < 2) continue; + auto id = std::stoull(values[1]); + + size_t shard_id = id % shard_num; + if (shard_id >= shard_end || shard_id < shard_start) { + VLOG(4) << "will not load " << id << " from " << path + << ", please check id distribution"; + continue; + } + + if (count % 1000000 == 0) { + VLOG(0) << count << " nodes are loaded from filepath"; + } + + std::string nt = values[0]; + if (nt != 
node_type) { + continue; + } + + size_t index = shard_id - shard_start; + + auto node = shards[index].add_feature_node(id); + + node->set_feature_size(feat_name.size()); + + for (size_t slice = 2; slice < values.size(); slice++) { + auto feat = this->parse_feature(values[slice]); + if (feat.first >= 0) { + node->set_feature(feat.first, feat.second); + } else { + VLOG(4) << "Node feature: " << values[slice] + << " not in feature_map."; + } + } + valid_count++; + } + } + + VLOG(0) << valid_count << "/" << count << " nodes in type " << node_type + << " are loaded successfully in " << path; + return 0; +} + +int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { + auto paths = paddle::string::split_string(path, ";"); + int count = 0; + std::string sample_type = "random"; + bool is_weighted = false; + int valid_count = 0; + + for (auto path : paths) { + std::ifstream file(path); + std::string line; + while (std::getline(file, line)) { + auto values = paddle::string::split_string(line, "\t"); + count++; + if (values.size() < 2) continue; + auto src_id = std::stoull(values[0]); + auto dst_id = std::stoull(values[1]); + if (reverse_edge) { + std::swap(src_id, dst_id); + } + float weight = 1; + if (values.size() == 3) { + weight = std::stof(values[2]); + sample_type = "weighted"; + is_weighted = true; + } + + size_t src_shard_id = src_id % shard_num; + + if (src_shard_id >= shard_end || src_shard_id < shard_start) { + VLOG(4) << "will not load " << src_id << " from " << path + << ", please check id distribution"; + continue; + } + if (count % 1000000 == 0) { + VLOG(0) << count << " edges are loaded from filepath"; + } + + size_t index = src_shard_id - shard_start; + shards[index].add_graph_node(src_id)->build_edges(is_weighted); + shards[index].add_neighboor(src_id, dst_id, weight); + valid_count++; + } + } + VLOG(0) << valid_count << "/" << count << " edges are loaded successfully in " + << path; + + // Build Sampler j + + for (auto &shard : shards) { + auto bucket = shard.get_bucket(); + for (int i = 0; i < bucket.size(); i++) { + bucket[i]->build_sampler(sample_type); + } + } + return 0; +} + +Node *GraphTable::find_node(uint64_t id) { + size_t shard_id = id % shard_num; + if (shard_id >= shard_end || shard_id < shard_start) { + return nullptr; + } + size_t index = shard_id - shard_start; + Node *node = shards[index].find_node(id); + return node; +} +uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { + return node_id % shard_num % shard_num_per_table % task_pool_size_; +} +int32_t GraphTable::random_sample_nodes(int sample_size, + std::unique_ptr &buffer, + int &actual_size) { + bool need_feature = false; + int total_size = 0; + for (int i = 0; i < shards.size(); i++) { + total_size += shards[i].get_size(); + } + if (sample_size > total_size) sample_size = total_size; + int range_num = random_sample_nodes_ranges; + if (range_num > sample_size) range_num = sample_size; + if (sample_size == 0 || range_num == 0) return 0; + std::vector ranges_len, ranges_pos; + int remain = sample_size, last_pos = -1, num; + std::set separator_set; + for (int i = 0; i < range_num - 1; i++) { + while (separator_set.find(num = rand() % (sample_size - 1)) != + separator_set.end()) + ; + separator_set.insert(num); + } + for (auto p : separator_set) { + ranges_len.push_back(p - last_pos); + last_pos = p; + } + ranges_len.push_back(sample_size - 1 - last_pos); + remain = total_size - sample_size + range_num; + separator_set.clear(); + for (int i = 0; i < range_num; i++) { + while 
(separator_set.find(num = rand() % remain) != separator_set.end()) + ; + separator_set.insert(num); + } + int used = 0, index = 0; + last_pos = -1; + for (auto p : separator_set) { + used += p - last_pos - 1; + last_pos = p; + ranges_pos.push_back(used); + used += ranges_len[index++]; + } + std::vector> first_half, second_half; + int start_index = rand() % total_size; + for (int i = 0; i < ranges_len.size() && i < ranges_pos.size(); i++) { + if (ranges_pos[i] + ranges_len[i] - 1 + start_index < total_size) + first_half.push_back({ranges_pos[i] + start_index, + ranges_pos[i] + ranges_len[i] + start_index}); + else if (ranges_pos[i] + start_index >= total_size) { + second_half.push_back( + {ranges_pos[i] + start_index - total_size, + ranges_pos[i] + ranges_len[i] + start_index - total_size}); + } else { + first_half.push_back({ranges_pos[i] + start_index, total_size}); + second_half.push_back( + {0, ranges_pos[i] + ranges_len[i] + start_index - total_size}); + } + } + for (auto &pair : first_half) second_half.push_back(pair); + std::vector res; + get_nodes_ids_by_ranges(second_half, res); + actual_size = res.size() * sizeof(uint64_t); + buffer.reset(new char[actual_size]); + char *pointer = buffer.get(); + memcpy(pointer, res.data(), actual_size); + return 0; +} +int32_t GraphTable::random_sample_neighboors( + uint64_t *node_ids, int sample_size, + std::vector> &buffers, + std::vector &actual_sizes) { + size_t node_num = buffers.size(); + std::vector> tasks; + for (size_t idx = 0; idx < node_num; ++idx) { + uint64_t &node_id = node_ids[idx]; + std::unique_ptr &buffer = buffers[idx]; + int &actual_size = actual_sizes[idx]; + tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( + [&]() -> int { + Node *node = find_node(node_id); + + if (node == nullptr) { + actual_size = 0; + return 0; + } + std::vector res = node->sample_k(sample_size); + actual_size = res.size() * (Node::id_size + Node::weight_size); + int offset = 0; + uint64_t id; + float weight; + char *buffer_addr = new char[actual_size]; + buffer.reset(buffer_addr); + for (int &x : res) { + id = node->get_neighbor_id(x); + weight = node->get_neighbor_weight(x); + memcpy(buffer_addr + offset, &id, Node::id_size); + offset += Node::id_size; + memcpy(buffer_addr + offset, &weight, Node::weight_size); + offset += Node::weight_size; + } + return 0; + })); + } + for (size_t idx = 0; idx < node_num; ++idx) { + tasks[idx].get(); + } + return 0; +} + +int32_t GraphTable::get_node_feat(const std::vector &node_ids, + const std::vector &feature_names, + std::vector> &res) { + size_t node_num = node_ids.size(); + std::vector> tasks; + for (size_t idx = 0; idx < node_num; ++idx) { + uint64_t node_id = node_ids[idx]; + tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( + [&, idx, node_id]() -> int { + Node *node = find_node(node_id); + + if (node == nullptr) { + return 0; + } + for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + const std::string &feature_name = feature_names[feat_idx]; + if (feat_id_map.find(feature_name) != feat_id_map.end()) { + // res[feat_idx][idx] = + // node->get_feature(feat_id_map[feature_name]); + auto feat = node->get_feature(feat_id_map[feature_name]); + res[feat_idx][idx] = feat; + } + } + return 0; + })); + } + for (size_t idx = 0; idx < node_num; ++idx) { + tasks[idx].get(); + } + return 0; +} + +std::pair GraphTable::parse_feature( + std::string feat_str) { + // Return (feat_id, btyes) if name are in this->feat_name, else return (-1, + // "") + auto 
fields = paddle::string::split_string(feat_str, " "); + if (this->feat_id_map.count(fields[0])) { + int32_t id = this->feat_id_map[fields[0]]; + std::string dtype = this->feat_dtype[id]; + int32_t shape = this->feat_shape[id]; + std::vector values(fields.begin() + 1, fields.end()); + if (dtype == "feasign") { + return std::make_pair( + int32_t(id), paddle::string::join_strings(values, ' ')); + } else if (dtype == "string") { + return std::make_pair( + int32_t(id), paddle::string::join_strings(values, ' ')); + } else if (dtype == "float32") { + return std::make_pair( + int32_t(id), FeatureNode::parse_value_to_bytes(values)); + } else if (dtype == "float64") { + return std::make_pair( + int32_t(id), FeatureNode::parse_value_to_bytes(values)); + } else if (dtype == "int32") { + return std::make_pair( + int32_t(id), FeatureNode::parse_value_to_bytes(values)); + } else if (dtype == "int64") { + return std::make_pair( + int32_t(id), FeatureNode::parse_value_to_bytes(values)); + } + } + return std::make_pair(-1, ""); +} + +int32_t GraphTable::pull_graph_list(int start, int total_size, + std::unique_ptr &buffer, + int &actual_size, bool need_feature, + int step) { + if (start < 0) start = 0; + int size = 0, cur_size; + std::vector>> tasks; + for (size_t i = 0; i < shards.size() && total_size > 0; i++) { + cur_size = shards[i].get_size(); + if (size + cur_size <= start) { + size += cur_size; + continue; + } + int count = std::min(1 + (size + cur_size - start - 1) / step, total_size); + int end = start + (count - 1) * step + 1; + tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( + [this, i, start, end, step, size]() -> std::vector { + + return this->shards[i].get_batch(start - size, end - size, step); + })); + start += count * step; + total_size -= count; + size += cur_size; + } + for (size_t i = 0; i < tasks.size(); ++i) { + tasks[i].wait(); + } + size = 0; + std::vector> res; + for (size_t i = 0; i < tasks.size(); i++) { + res.push_back(tasks[i].get()); + for (size_t j = 0; j < res.back().size(); j++) { + size += res.back()[j]->get_size(need_feature); + } + } + char *buffer_addr = new char[size]; + buffer.reset(buffer_addr); + int index = 0; + for (size_t i = 0; i < res.size(); i++) { + for (size_t j = 0; j < res[i].size(); j++) { + res[i][j]->to_buffer(buffer_addr + index, need_feature); + index += res[i][j]->get_size(need_feature); + } + } + actual_size = size; + return 0; +} +int32_t GraphTable::initialize() { + _shards_task_pool.resize(task_pool_size_); + for (size_t i = 0; i < _shards_task_pool.size(); ++i) { + _shards_task_pool[i].reset(new ::ThreadPool(1)); + } + server_num = _shard_num; + // VLOG(0) << "in init graph table server num = " << server_num; + /* + _shard_num is actually server number here + when a server initialize its tables, it sets tables' _shard_num to server_num, + and _shard_idx to server + rank + */ + auto common = _config.common(); + + this->table_name = common.table_name(); + this->table_type = common.name(); + VLOG(0) << " init graph table type " << this->table_type << " table name " + << this->table_name; + int feat_conf_size = static_cast(common.attributes().size()); + for (int i = 0; i < feat_conf_size; i++) { + auto &f_name = common.attributes()[i]; + auto &f_shape = common.dims()[i]; + auto &f_dtype = common.params()[i]; + this->feat_name.push_back(f_name); + this->feat_shape.push_back(f_shape); + this->feat_dtype.push_back(f_dtype); + this->feat_id_map[f_name] = i; + VLOG(0) << "init graph table feat conf name:" << f_name + << " shape:" << f_shape 
<< " dtype:" << f_dtype; + } + + shard_num = _config.shard_num(); + VLOG(0) << "in init graph table shard num = " << shard_num << " shard_idx" + << _shard_idx; + shard_num_per_table = sparse_local_shard_num(shard_num, server_num); + shard_start = _shard_idx * shard_num_per_table; + shard_end = shard_start + shard_num_per_table; + VLOG(0) << "in init graph table shard idx = " << _shard_idx << " shard_start " + << shard_start << " shard_end " << shard_end; + // shards.resize(shard_num_per_table); + shards = std::vector(shard_num_per_table, GraphShard(shard_num)); + return 0; +} +} +}; diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h new file mode 100644 index 00000000000000..8ddf3c8f904a6c --- /dev/null +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -0,0 +1,147 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include +#include +#include +#include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/common_table.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace distributed { +class GraphShard { + public: + // static int bucket_low_bound; + // static int gcd(int s, int t) { + // if (s % t == 0) return t; + // return gcd(t, s % t); + // } + size_t get_size(); + GraphShard() {} + GraphShard(int shard_num) { + this->shard_num = shard_num; + // bucket_size = init_bucket_size(shard_num); + // bucket.resize(bucket_size); + } + std::vector &get_bucket() { return bucket; } + std::vector get_batch(int start, int end, int step); + // int init_bucket_size(int shard_num) { + // for (int i = bucket_low_bound;; i++) { + // if (gcd(i, shard_num) == 1) return i; + // } + // return -1; + // } + std::vector get_ids_by_range(int start, int end) { + std::vector res; + for (int i = start; i < end && i < bucket.size(); i++) { + res.push_back(bucket[i]->get_id()); + } + return res; + } + GraphNode *add_graph_node(uint64_t id); + FeatureNode *add_feature_node(uint64_t id); + Node *find_node(uint64_t id); + void add_neighboor(uint64_t id, uint64_t dst_id, float weight); + // std::unordered_map::iterator> + std::unordered_map get_node_location() { + return node_location; + } + + private: + std::unordered_map node_location; + int shard_num; + std::vector bucket; +}; +class GraphTable : public SparseTable { + public: + GraphTable() {} + virtual ~GraphTable() {} + virtual int32_t pull_graph_list(int start, int size, + std::unique_ptr &buffer, + int &actual_size, bool need_feature, + int step); + + virtual int32_t random_sample_neighboors( + uint64_t *node_ids, int sample_size, + std::vector> &buffers, + std::vector &actual_sizes); + + int32_t random_sample_nodes(int sample_size, std::unique_ptr 
&buffers, + int &actual_sizes); + + virtual int32_t get_nodes_ids_by_ranges( + std::vector> ranges, std::vector &res); + virtual int32_t initialize(); + + int32_t load(const std::string &path, const std::string ¶m); + + int32_t load_edges(const std::string &path, bool reverse); + + int32_t load_nodes(const std::string &path, std::string node_type); + + Node *find_node(uint64_t id); + + virtual int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) { + return 0; + } + + virtual int32_t push_sparse(const uint64_t *keys, const float *values, + size_t num) { + return 0; + } + + virtual void clear() {} + virtual int32_t flush() { return 0; } + virtual int32_t shrink(const std::string ¶m) { return 0; } + //指定保存路径 + virtual int32_t save(const std::string &path, const std::string &converter) { + return 0; + } + virtual int32_t initialize_shard() { return 0; } + virtual uint32_t get_thread_pool_index(uint64_t node_id); + virtual std::pair parse_feature(std::string feat_str); + + virtual int32_t get_node_feat(const std::vector &node_ids, + const std::vector &feature_names, + std::vector> &res); + + protected: + std::vector shards; + size_t shard_start, shard_end, server_num, shard_num_per_table, shard_num; + const int task_pool_size_ = 11; + const int random_sample_nodes_ranges = 3; + + std::vector feat_name; + std::vector feat_dtype; + std::vector feat_shape; + std::unordered_map feat_id_map; + std::string table_name; + std::string table_type; + + std::vector> _shards_task_pool; +}; +} // namespace distributed +}; // namespace paddle diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc index ffedbea14a0290..2e8c257b6aad47 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/table/common_sparse_table.cc @@ -254,7 +254,6 @@ int32_t CommonSparseTable::initialize_value() { } auto accessor = _config.accessor(); - std::vector feasigns; for (size_t x = 0; x < accessor.fea_dim(); ++x) { @@ -271,9 +270,14 @@ int32_t CommonSparseTable::initialize_value() { std::vector ids(bucket_feasigns); std::copy(feasigns.begin() + buckets[x], feasigns.begin() + buckets[x + 1], ids.begin()); + + std::vector fres; + fres.resize(ids.size(), 1); + + auto pull_value = PullSparseValue(ids, fres, param_dim_); std::vector pulls; pulls.resize(bucket_feasigns * param_dim_); - pull_sparse(pulls.data(), ids.data(), bucket_feasigns); + pull_sparse(pulls.data(), pull_value); } return 0; @@ -399,10 +403,51 @@ int32_t CommonSparseTable::pour() { return 0; } -int32_t CommonSparseTable::pull_sparse(float* pull_values, const uint64_t* keys, - size_t num) { +int32_t CommonSparseTable::pull_sparse(float* pull_values, + const PullSparseValue& pull_value) { rwlock_->RDLock(); + auto shard_num = task_pool_size_; + std::vector> tasks(shard_num); + + for (int shard_id = 0; shard_id < shard_num; ++shard_id) { + tasks[shard_id] = _shards_task_pool[shard_id]->enqueue( + [this, shard_id, shard_num, &pull_value, &pull_values]() -> int { + auto& block = shard_values_[shard_id]; + + std::vector offsets; + pull_value.Fission(shard_id, shard_num, &offsets); + + if (pull_value.is_training_) { + for (auto& offset : offsets) { + auto feasign = pull_value.feasigns_[offset]; + auto frequencie = pull_value.frequencies_[offset]; + auto* value = block->Init(feasign, true, frequencie); + std::copy_n(value + param_offset_, param_dim_, + pull_values + param_dim_ * offset); + } + } else { + for (auto& offset : offsets) { + auto feasign = 
pull_value.feasigns_[offset]; + auto* value = block->Init(feasign, false); + std::copy_n(value + param_offset_, param_dim_, + pull_values + param_dim_ * offset); + } + } + + return 0; + }); + } + + for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { + tasks[shard_id].wait(); + } + rwlock_->UNLock(); + return 0; +} + +int32_t CommonSparseTable::pull_sparse_ptr(char** pull_values, + const uint64_t* keys, size_t num) { std::vector> offset_bucket; offset_bucket.resize(task_pool_size_); @@ -422,9 +467,10 @@ int32_t CommonSparseTable::pull_sparse(float* pull_values, const uint64_t* keys, for (int i = 0; i < offsets.size(); ++i) { auto offset = offsets[i]; auto id = keys[offset]; - auto* value = block->Init(id); - std::copy_n(value + param_offset_, param_dim_, - pull_values + param_dim_ * offset); + auto* value = block->InitGet(id); + // std::copy_n(value + param_offset_, param_dim_, + // pull_values + param_dim_ * offset); + pull_values[offset] = (char*)value; } return 0; @@ -434,7 +480,6 @@ int32_t CommonSparseTable::pull_sparse(float* pull_values, const uint64_t* keys, for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { tasks[shard_id].wait(); } - rwlock_->UNLock(); return 0; } @@ -494,6 +539,45 @@ int32_t CommonSparseTable::push_sparse(const uint64_t* keys, return 0; } +int32_t CommonSparseTable::push_sparse(const uint64_t* keys, + const float** values, size_t num) { + _push_sparse(keys, values, num); + return 0; +} + +int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, + const float** values, size_t num) { + rwlock_->RDLock(); + std::vector> offset_bucket; + offset_bucket.resize(task_pool_size_); + + for (int x = 0; x < num; ++x) { + auto y = keys[x] % task_pool_size_; + offset_bucket[y].push_back(x); + } + + std::vector> tasks(task_pool_size_); + + for (int shard_id = 0; shard_id < task_pool_size_; ++shard_id) { + tasks[shard_id] = _shards_task_pool[shard_id]->enqueue( + [this, shard_id, &keys, &values, num, &offset_bucket]() -> int { + auto& offsets = offset_bucket[shard_id]; + for (size_t i = 0; i < offsets.size(); ++i) { + std::vector tmp_off = {0}; + optimizer_->update(keys + offsets[i], values[offsets[i]], num, + tmp_off, shard_values_[shard_id].get()); + } + return 0; + }); + } + + for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { + tasks[shard_id].wait(); + } + rwlock_->UNLock(); + return 0; +} + int32_t CommonSparseTable::push_sparse_param(const uint64_t* keys, const float* values, size_t num) { rwlock_->RDLock(); diff --git a/paddle/fluid/distributed/table/common_sparse_table.h b/paddle/fluid/distributed/table/common_sparse_table.h index 98cbf2b4a21057..50c295da53464c 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.h +++ b/paddle/fluid/distributed/table/common_sparse_table.h @@ -61,12 +61,17 @@ class CommonSparseTable : public SparseTable { int32_t save(const std::string& path, const std::string& param); virtual std::pair print_table_stat(); - virtual int32_t pull_sparse(float* pull_values, const uint64_t* keys, - size_t num); + virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value); + + virtual int32_t pull_sparse_ptr(char** pull_values, const uint64_t* keys, + size_t num); virtual int32_t push_sparse(const uint64_t* keys, const float* values, size_t num); + virtual int32_t push_sparse(const uint64_t* keys, const float** values, + size_t num); + // only for sparse geo table virtual int32_t push_sparse_param(const uint64_t* keys, const float* values, size_t num); @@ -81,6 +86,8 @@ class 
CommonSparseTable : public SparseTable { protected: virtual int32_t _push_sparse(const uint64_t* keys, const float* values, size_t num); + virtual int32_t _push_sparse(const uint64_t* keys, const float** values, + size_t num); private: const int task_pool_size_ = 11; diff --git a/paddle/fluid/distributed/table/common_table.h b/paddle/fluid/distributed/table/common_table.h index dc3cfa75ff6898..bc7f17f5f24579 100644 --- a/paddle/fluid/distributed/table/common_table.h +++ b/paddle/fluid/distributed/table/common_table.h @@ -98,8 +98,8 @@ class DenseTable : public Table { virtual ~DenseTable() {} virtual void *get_shard(size_t shard_idx) { return 0; } - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, @@ -123,8 +123,8 @@ class BarrierTable : public Table { int32_t push_dense(const float *values, size_t num) override { return 0; } - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, diff --git a/paddle/fluid/distributed/table/depends/dense.h b/paddle/fluid/distributed/table/depends/dense.h index a2acdfd20148ac..8079003d1bf8f6 100644 --- a/paddle/fluid/distributed/table/depends/dense.h +++ b/paddle/fluid/distributed/table/depends/dense.h @@ -89,7 +89,6 @@ class DSGD : public DenseOptimizer { auto blas = GetBlas(); float lr = *(global_learning_rate_) * (*learning_rate); - VLOG(4) << "DSGD LearningRate: " << lr; blas.VCOPY(update_numel, update_values + begin, grads.data()); blas.SCAL(update_numel, lr, grads.data()); blas.VSUB(update_numel, param + begin, grads.data(), param + begin); @@ -157,7 +156,6 @@ class DAdam : public DenseOptimizer { beta2_pow[0] = beta2_pow[0] * beta2; float lr_ = *(global_learning_rate_)*learning_rate[0]; - VLOG(4) << "DAdam LearningRate: " << lr_; lr_ *= sqrt(1 - beta2_pow[0]) / (1 - beta1_pow[0]); float* tmp_ = tmp.data(); diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/table/depends/large_scale_kv.h index ba79a381a6d881..68d252661edd53 100644 --- a/paddle/fluid/distributed/table/depends/large_scale_kv.h +++ b/paddle/fluid/distributed/table/depends/large_scale_kv.h @@ -87,7 +87,7 @@ class ValueBlock { value_dims_(value_dims), value_offsets_(value_offsets), value_idx_(value_idx) { - for (int x = 0; x < value_dims.size(); ++x) { + for (size_t x = 0; x < value_dims.size(); ++x) { value_length_ += value_dims[x]; } @@ -96,13 +96,15 @@ class ValueBlock { auto slices = string::split_string(entry_attr, ":"); if (slices[0] == "none") { entry_func_ = std::bind(&count_entry, std::placeholders::_1, 0); + threshold_ = 0; } else if (slices[0] == "count_filter_entry") { - int threshold = std::stoi(slices[1]); - entry_func_ = std::bind(&count_entry, std::placeholders::_1, threshold); + threshold_ = std::stoi(slices[1]); + entry_func_ = + std::bind(&count_entry, std::placeholders::_1, threshold_); } else if (slices[0] == "probability_entry") { - float threshold = std::stof(slices[1]); + threshold_ = std::stof(slices[1]); entry_func_ = - std::bind(&probility_entry, std::placeholders::_1, threshold); + std::bind(&probility_entry, std::placeholders::_1, threshold_); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Not supported Entry Type : %s, Only support 
[CountFilterEntry, " @@ -155,7 +157,8 @@ class ValueBlock { } // pull - float *Init(const uint64_t &id, const bool with_update = true) { + float *Init(const uint64_t &id, const bool with_update = true, + const int counter = 1) { if (!Has(id)) { values_[id] = std::make_shared(value_length_); } @@ -163,22 +166,37 @@ class ValueBlock { auto &value = values_.at(id); if (with_update) { - AttrUpdate(value); + AttrUpdate(value, counter); } return value->data_.data(); } - void AttrUpdate(std::shared_ptr value) { + VALUE *InitGet(const uint64_t &id, const bool with_update = true, + const int counter = 1) { + if (!Has(id)) { + values_[id] = std::make_shared(value_length_); + } + + auto &value = values_.at(id); + + if (with_update) { + AttrUpdate(value, counter); + } + + return value.get(); + } + + void AttrUpdate(std::shared_ptr value, const int counter) { // update state value->unseen_days_ = 0; - ++value->count_; + value->count_ += counter; if (!value->is_entry_) { value->is_entry_ = entry_func_(value); if (value->is_entry_) { // initialize - for (int x = 0; x < value_names_.size(); ++x) { + for (size_t x = 0; x < value_names_.size(); ++x) { initializers_[x]->GetValue(value->data_.data() + value_offsets_[x], value_dims_[x]); } @@ -223,6 +241,8 @@ class ValueBlock { return; } + float GetThreshold() { return threshold_; } + private: bool Has(const uint64_t id) { auto got = values_.find(id); @@ -245,6 +265,7 @@ class ValueBlock { std::function)> entry_func_; std::vector> initializers_; + float threshold_; }; } // namespace distributed diff --git a/paddle/fluid/distributed/table/depends/sparse.h b/paddle/fluid/distributed/table/depends/sparse.h index 672d6e7d396874..0e1d7ef03c129c 100644 --- a/paddle/fluid/distributed/table/depends/sparse.h +++ b/paddle/fluid/distributed/table/depends/sparse.h @@ -110,7 +110,6 @@ class SSGD : public SparseOptimizer { auto* value = block->Get(id); float learning_rate = *(global_learning_rate_) * (value + lr_offset)[0]; - VLOG(4) << "SSGD LearningRate: " << learning_rate; float* param = value + param_offset; std::vector grads; @@ -166,7 +165,6 @@ class SAdam : public SparseOptimizer { if (!block->GetEntry(id)) continue; auto* values = block->Get(id); float lr_ = *(global_learning_rate_) * (values + lr_offset)[0]; - VLOG(4) << "SAdam LearningRate: " << lr_; float* param = values + param_offset; float* moment1 = values + m1_offset; float* moment2 = values + m2_offset; diff --git a/paddle/fluid/distributed/table/depends/sparse_utils.h b/paddle/fluid/distributed/table/depends/sparse_utils.h new file mode 100644 index 00000000000000..c185dd17d792e4 --- /dev/null +++ b/paddle/fluid/distributed/table/depends/sparse_utils.h @@ -0,0 +1,74 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
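Note: the reworked CommonSparseTable::pull_sparse earlier in this diff fans each request out over the shard thread pool and asks PullSparseValue::Fission (declared in the new sparse_utils.h below) for the offsets that each shard owns, so that the per-shard lambdas can fill pull_values without overlapping writes. The following standalone sketch is illustrative only and is not part of the patch; the names Fission, feasigns and the key % shard_num rule are taken from the diff, everything else is assumed for the example.

// Hedged sketch: the feasign -> shard bucketing that PullSparseValue::Fission
// performs in this patch, re-implemented as a free function for illustration.
#include <cstdint>
#include <iostream>
#include <vector>

// Collect the positions (offsets) of the keys that belong to one shard,
// mirroring pull_value.Fission(shard_id, shard_num, &offsets) above.
std::vector<int> FissionSketch(const std::vector<uint64_t>& feasigns,
                               int shard_id, int shard_num) {
  std::vector<int> offsets;
  for (int x = 0; x < static_cast<int>(feasigns.size()); ++x) {
    if (feasigns[x] % static_cast<uint64_t>(shard_num) ==
        static_cast<uint64_t>(shard_id)) {
      offsets.push_back(x);
    }
  }
  return offsets;
}

int main() {
  std::vector<uint64_t> keys = {0, 1, 2, 3, 4, 14, 25};
  const int shard_num = 11;  // matches task_pool_size_ in the diff
  // Each shard worker only touches its own offsets, so one pull_values
  // buffer can be filled concurrently without races.
  for (int shard = 0; shard < shard_num; ++shard) {
    auto offsets = FissionSketch(keys, shard, shard_num);
    if (offsets.empty()) continue;
    std::cout << "shard " << shard << " owns offsets:";
    for (int off : offsets) std::cout << ' ' << off;
    std::cout << '\n';
  }
  return 0;
}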
+ +#pragma once + +#include +#include +#include +#include +#include + +namespace paddle { +namespace distributed { + +struct PullSparseValue { + explicit PullSparseValue(int numel, int dim) + : numel_(numel), + dim_(dim), + is_training_(true), + feasigns_(nullptr), + frequencies_(nullptr) {} + + explicit PullSparseValue(std::vector feasigns, + std::vector frequencies, int dim) { + numel_ = feasigns.size(); + dim_ = dim; + is_training_ = true; + feasigns_ = feasigns.data(); + frequencies_ = frequencies.data(); + } + + void DeserializeFromBytes(void* bytes) { + /* + |---isTraining--------------| + |---8*{num}B(keysData)------| + |---4*{num}B(Frequencies)---| + */ + auto* begin = reinterpret_cast(bytes); + is_training_ = reinterpret_cast(begin)[0]; + feasigns_ = reinterpret_cast(begin + sizeof(bool)); + frequencies_ = reinterpret_cast(begin + sizeof(bool) + + sizeof(uint64_t) * numel_); + } + + void Fission(const int shard_id, const int shard_num, + std::vector* offset_shard) const { + offset_shard->reserve(numel_ / shard_num + 1); + for (int x = 0; x < numel_; ++x) { + if (feasigns_[x] % shard_num == shard_id) { + offset_shard->push_back(x); + } + } + } + + int numel_; + int dim_; + bool is_training_; + uint64_t* feasigns_; + uint32_t* frequencies_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc b/paddle/fluid/distributed/table/graph/graph_edge.cc similarity index 59% rename from paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc rename to paddle/fluid/distributed/table/graph/graph_edge.cc index 3f3b6b959e3019..0ab0d5a76d6715 100644 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc +++ b/paddle/fluid/distributed/table/graph/graph_edge.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,16 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" - +#include "paddle/fluid/distributed/table/graph/graph_edge.h" +#include namespace paddle { -namespace operators { namespace distributed { -std::once_flag AsyncSparseParamUpdateRecorder::init_flag_; -std::unique_ptr - AsyncSparseParamUpdateRecorder::recorder_(nullptr); +void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { + id_arr.push_back(id); +} -} // namespace distributed -} // namespace operators -} // namespace paddle +void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { + id_arr.push_back(id); + weight_arr.push_back(weight); +} +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_edge.h b/paddle/fluid/distributed/table/graph/graph_edge.h new file mode 100644 index 00000000000000..3dfe5a6f357a7c --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_edge.h @@ -0,0 +1,46 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +namespace paddle { +namespace distributed { + +class GraphEdgeBlob { + public: + GraphEdgeBlob() {} + virtual ~GraphEdgeBlob() {} + size_t size() { return id_arr.size(); } + virtual void add_edge(uint64_t id, float weight); + uint64_t get_id(int idx) { return id_arr[idx]; } + virtual float get_weight(int idx) { return 1; } + + protected: + std::vector id_arr; +}; + +class WeightedGraphEdgeBlob : public GraphEdgeBlob { + public: + WeightedGraphEdgeBlob() {} + virtual ~WeightedGraphEdgeBlob() {} + virtual void add_edge(uint64_t id, float weight); + virtual float get_weight(int idx) { return weight_arr[idx]; } + + protected: + std::vector weight_arr; +}; +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_node.cc b/paddle/fluid/distributed/table/graph/graph_node.cc new file mode 100644 index 00000000000000..816d31b979072c --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_node.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
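Note: GraphEdgeBlob above ignores the weight argument and reports a constant weight of 1, while WeightedGraphEdgeBlob keeps a parallel weight array; GraphNode::build_edges later in this diff picks one of the two depending on is_weighted. The self-contained sketch below shows that split in miniature; it is illustrative only and not the patch code.

// Hedged sketch: trimmed-down analogue of GraphEdgeBlob / WeightedGraphEdgeBlob,
// showing why get_weight() on the unweighted blob is always 1.
#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

class EdgeBlobSketch {
 public:
  virtual ~EdgeBlobSketch() = default;
  virtual void add_edge(uint64_t id, float /*weight*/) { ids_.push_back(id); }
  virtual float get_weight(int /*idx*/) const { return 1.0f; }
  uint64_t get_id(int idx) const { return ids_[idx]; }
  size_t size() const { return ids_.size(); }

 protected:
  std::vector<uint64_t> ids_;
};

class WeightedEdgeBlobSketch : public EdgeBlobSketch {
 public:
  void add_edge(uint64_t id, float weight) override {
    ids_.push_back(id);
    weights_.push_back(weight);
  }
  float get_weight(int idx) const override { return weights_[idx]; }

 private:
  std::vector<float> weights_;
};

int main() {
  // build_edges(is_weighted) would choose one of the two blobs; samplers then
  // read weights through the common interface.
  std::unique_ptr<EdgeBlobSketch> plain(new EdgeBlobSketch);
  std::unique_ptr<EdgeBlobSketch> weighted(new WeightedEdgeBlobSketch);
  plain->add_edge(45, 0.34f);
  weighted->add_edge(45, 0.34f);
  std::cout << plain->get_weight(0) << " vs " << weighted->get_weight(0) << "\n";
  return 0;
}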
+ +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include +namespace paddle { +namespace distributed { + +GraphNode::~GraphNode() { + if (sampler != nullptr) { + delete sampler; + sampler = nullptr; + } + if (edges != nullptr) { + delete edges; + edges = nullptr; + } +} + +int Node::weight_size = sizeof(float); +int Node::id_size = sizeof(uint64_t); +int Node::int_size = sizeof(int); + +int Node::get_size(bool need_feature) { return id_size + int_size; } + +void Node::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + memcpy(buffer, &feat_num, sizeof(int)); +} + +void Node::recover_from_buffer(char* buffer) { memcpy(&id, buffer, id_size); } + +int FeatureNode::get_size(bool need_feature) { + int size = id_size + int_size; // id, feat_num + if (need_feature) { + size += feature.size() * int_size; + for (const std::string& fea : feature) { + size += fea.size(); + } + } + return size; +} + +void GraphNode::build_edges(bool is_weighted) { + if (edges == nullptr) { + if (is_weighted == true) { + edges = new WeightedGraphEdgeBlob(); + } else { + edges = new GraphEdgeBlob(); + } + } +} +void GraphNode::build_sampler(std::string sample_type) { + if (sample_type == "random") { + sampler = new RandomSampler(); + } else if (sample_type == "weighted") { + sampler = new WeightedSampler(); + } + sampler->build(edges); +} +void FeatureNode::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + int feat_len; + if (need_feature) { + feat_num += feature.size(); + memcpy(buffer, &feat_num, sizeof(int)); + buffer += sizeof(int); + for (int i = 0; i < feat_num; ++i) { + feat_len = feature[i].size(); + memcpy(buffer, &feat_len, sizeof(int)); + buffer += sizeof(int); + memcpy(buffer, feature[i].c_str(), feature[i].size()); + buffer += feature[i].size(); + } + } else { + memcpy(buffer, &feat_num, sizeof(int)); + } +} +void FeatureNode::recover_from_buffer(char* buffer) { + int feat_num, feat_len; + memcpy(&id, buffer, id_size); + buffer += id_size; + + memcpy(&feat_num, buffer, sizeof(int)); + buffer += sizeof(int); + + feature.clear(); + for (int i = 0; i < feat_num; ++i) { + memcpy(&feat_len, buffer, sizeof(int)); + buffer += sizeof(int); + + char str[feat_len + 1]; + memcpy(str, buffer, feat_len); + buffer += feat_len; + str[feat_len] = '\0'; + feature.push_back(std::string(str)); + } +} +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_node.h b/paddle/fluid/distributed/table/graph/graph_node.h new file mode 100644 index 00000000000000..8ad795ac97b549 --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_node.h @@ -0,0 +1,127 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/distributed/table/graph/graph_weighted_sampler.h" +namespace paddle { +namespace distributed { + +class Node { + public: + Node() {} + Node(uint64_t id) : id(id) {} + virtual ~Node() {} + static int id_size, int_size, weight_size; + uint64_t get_id() { return id; } + void set_id(uint64_t id) { this->id = id; } + + virtual void build_edges(bool is_weighted) {} + virtual void build_sampler(std::string sample_type) {} + virtual void add_edge(uint64_t id, float weight) {} + virtual std::vector sample_k(int k) { return std::vector(); } + virtual uint64_t get_neighbor_id(int idx) { return 0; } + virtual float get_neighbor_weight(int idx) { return 1.; } + + virtual int get_size(bool need_feature); + virtual void to_buffer(char *buffer, bool need_feature); + virtual void recover_from_buffer(char *buffer); + virtual std::string get_feature(int idx) { return std::string(""); } + virtual void set_feature(int idx, std::string str) {} + virtual void set_feature_size(int size) {} + virtual int get_feature_size() { return 0; } + + protected: + uint64_t id; +}; + +class GraphNode : public Node { + public: + GraphNode() : Node(), sampler(nullptr), edges(nullptr) {} + GraphNode(uint64_t id) : Node(id), sampler(nullptr), edges(nullptr) {} + virtual ~GraphNode(); + virtual void build_edges(bool is_weighted); + virtual void build_sampler(std::string sample_type); + virtual void add_edge(uint64_t id, float weight) { + edges->add_edge(id, weight); + } + virtual std::vector sample_k(int k) { return sampler->sample_k(k); } + virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); } + virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); } + + protected: + Sampler *sampler; + GraphEdgeBlob *edges; +}; + +class FeatureNode : public Node { + public: + FeatureNode() : Node() {} + FeatureNode(uint64_t id) : Node(id) {} + virtual ~FeatureNode() {} + virtual int get_size(bool need_feature); + virtual void to_buffer(char *buffer, bool need_feature); + virtual void recover_from_buffer(char *buffer); + virtual std::string get_feature(int idx) { + if (idx < (int)this->feature.size()) { + return this->feature[idx]; + } else { + return std::string(""); + } + } + + virtual void set_feature(int idx, std::string str) { + if (idx >= (int)this->feature.size()) { + this->feature.resize(idx + 1); + } + this->feature[idx] = str; + } + virtual void set_feature_size(int size) { this->feature.resize(size); } + virtual int get_feature_size() { return this->feature.size(); } + + template + static std::string parse_value_to_bytes(std::vector feat_str) { + T v; + size_t Tsize = sizeof(T) * feat_str.size(); + char buffer[Tsize]; + for (size_t i = 0; i < feat_str.size(); i++) { + std::stringstream ss(feat_str[i]); + ss >> v; + std::memcpy(buffer + sizeof(T) * i, (char *)&v, sizeof(T)); + } + return std::string(buffer, Tsize); + } + + template + static std::vector parse_bytes_to_array(std::string feat_str) { + T v; + std::vector out; + size_t start = 0; + const char *buffer = feat_str.data(); + while (start < feat_str.size()) { + std::memcpy((char *)&v, buffer + start, sizeof(T)); + start += sizeof(T); + out.push_back(v); + } + return out; + } + + protected: + std::vector feature; +}; +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc new file mode 100644 index 00000000000000..3a680875e3df4a --- /dev/null +++ 
b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/table/graph/graph_weighted_sampler.h" +#include +#include +namespace paddle { +namespace distributed { + +void RandomSampler::build(GraphEdgeBlob *edges) { this->edges = edges; } + +std::vector RandomSampler::sample_k(int k) { + int n = edges->size(); + if (k > n) { + k = n; + } + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + std::vector sample_result; + std::unordered_map replace_map; + while (k--) { + int rand_int = rand() % n; + auto iter = replace_map.find(rand_int); + if (iter == replace_map.end()) { + sample_result.push_back(rand_int); + } else { + sample_result.push_back(iter->second); + } + + iter = replace_map.find(n - 1); + if (iter == replace_map.end()) { + replace_map[rand_int] = n - 1; + } else { + replace_map[rand_int] = iter->second; + } + --n; + } + return sample_result; +} + +WeightedSampler::WeightedSampler() { + left = nullptr; + right = nullptr; + edges = nullptr; +} + +WeightedSampler::~WeightedSampler() { + if (left != nullptr) { + delete left; + left = nullptr; + } + if (right != nullptr) { + delete right; + right = nullptr; + } +} + +void WeightedSampler::build(GraphEdgeBlob *edges) { + if (left != nullptr) { + delete left; + left = nullptr; + } + if (right != nullptr) { + delete right; + right = nullptr; + } + return build_one((WeightedGraphEdgeBlob *)edges, 0, edges->size()); +} + +void WeightedSampler::build_one(WeightedGraphEdgeBlob *edges, int start, + int end) { + count = 0; + this->edges = edges; + if (start + 1 == end) { + left = right = nullptr; + idx = start; + count = 1; + weight = edges->get_weight(idx); + + } else { + left = new WeightedSampler(); + right = new WeightedSampler(); + left->build_one(edges, start, start + (end - start) / 2); + right->build_one(edges, start + (end - start) / 2, end); + weight = left->weight + right->weight; + count = left->count + right->count; + } +} +std::vector WeightedSampler::sample_k(int k) { + if (k > count) { + k = count; + } + std::vector sample_result; + float subtract; + std::unordered_map subtract_weight_map; + std::unordered_map subtract_count_map; + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + while (k--) { + float query_weight = rand() % 100000 / 100000.0; + query_weight *= weight - subtract_weight_map[this]; + sample_result.push_back(sample(query_weight, subtract_weight_map, + subtract_count_map, subtract)); + } + return sample_result; +} + +int WeightedSampler::sample( + float query_weight, + std::unordered_map &subtract_weight_map, + std::unordered_map &subtract_count_map, + float &subtract) { + if (left == nullptr) { + subtract_weight_map[this] = weight; + subtract = weight; + subtract_count_map[this] = 1; + return idx; + } + int left_count = left->count - subtract_count_map[left]; + int 
right_count = right->count - subtract_count_map[right]; + float left_subtract = subtract_weight_map[left]; + int return_idx; + if (right_count == 0 || + left_count > 0 && left->weight - left_subtract >= query_weight) { + return_idx = left->sample(query_weight, subtract_weight_map, + subtract_count_map, subtract); + } else { + return_idx = + right->sample(query_weight - (left->weight - left_subtract), + subtract_weight_map, subtract_count_map, subtract); + } + subtract_weight_map[this] += subtract; + subtract_count_map[this]++; + return return_idx; +} +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h new file mode 100644 index 00000000000000..1787ab23b04316 --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h @@ -0,0 +1,58 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/distributed/table/graph/graph_edge.h" +namespace paddle { +namespace distributed { + +class Sampler { + public: + virtual ~Sampler() {} + virtual void build(GraphEdgeBlob *edges) = 0; + virtual std::vector sample_k(int k) = 0; +}; + +class RandomSampler : public Sampler { + public: + virtual ~RandomSampler() {} + virtual void build(GraphEdgeBlob *edges); + virtual std::vector sample_k(int k); + GraphEdgeBlob *edges; +}; + +class WeightedSampler : public Sampler { + public: + WeightedSampler(); + virtual ~WeightedSampler(); + WeightedSampler *left, *right; + float weight; + int count; + int idx; + GraphEdgeBlob *edges; + virtual void build(GraphEdgeBlob *edges); + virtual void build_one(WeightedGraphEdgeBlob *edges, int start, int end); + virtual std::vector sample_k(int k); + + private: + int sample(float query_weight, + std::unordered_map &subtract_weight_map, + std::unordered_map &subtract_count_map, + float &subtract); +}; +} +} diff --git a/paddle/fluid/operators/distributed/large_scale_kv.cc b/paddle/fluid/distributed/table/graph_edge.cc similarity index 62% rename from paddle/fluid/operators/distributed/large_scale_kv.cc rename to paddle/fluid/distributed/table/graph_edge.cc index d2673ed6ffb366..cc90f4c6516c18 100644 --- a/paddle/fluid/operators/distributed/large_scale_kv.cc +++ b/paddle/fluid/distributed/table/graph_edge.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,15 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
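Note: WeightedSampler::build_one above builds a binary tree over the edge list (each leaf holds one edge, each inner node the sum of its children's weights), and sample_k draws k distinct edges by descending that tree while subtracting the weight and count of edges already drawn. The sketch below shows the core weight-proportional pick on a prefix-sum array, which is the same idea without the without-replacement bookkeeping; it is illustrative only and not the patch code.

// Hedged sketch: weight-proportional selection over an edge list, the idea
// behind WeightedSampler::sample (the real code additionally removes picked
// edges via the subtract_* maps so repeated draws never return the same edge).
#include <algorithm>
#include <iostream>
#include <numeric>
#include <random>
#include <vector>

int WeightedPick(const std::vector<float>& weights, std::mt19937* rng) {
  std::vector<float> prefix(weights.size());
  std::partial_sum(weights.begin(), weights.end(), prefix.begin());
  std::uniform_real_distribution<float> dist(0.0f, prefix.back());
  float q = dist(*rng);
  // First position whose cumulative weight exceeds the query point,
  // analogous to descending left/right in the weight tree.
  int idx = static_cast<int>(
      std::upper_bound(prefix.begin(), prefix.end(), q) - prefix.begin());
  return std::min(idx, static_cast<int>(weights.size()) - 1);  // guard rounding edge
}

int main() {
  std::vector<float> edge_weights = {0.34f, 0.31f, 0.21f};  // e.g. node 37's edges
  std::mt19937 rng(2021);
  std::vector<int> histogram(edge_weights.size(), 0);
  for (int i = 0; i < 10000; ++i) {
    ++histogram[WeightedPick(edge_weights, &rng)];
  }
  for (size_t i = 0; i < histogram.size(); ++i) {
    std::cout << "edge " << i << " picked " << histogram[i] << " times\n";
  }
  return 0;
}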
-#include "paddle/fluid/operators/distributed/large_scale_kv.h" - +#include "paddle/fluid/distributed/table/graph_edge.h" +#include namespace paddle { -namespace operators { namespace distributed { -std::once_flag LargeScaleKV::init_flag_; -std::shared_ptr LargeScaleKV::scale_kv_(nullptr); +void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { + id_arr.push_back(id); +} -} // namespace distributed -} // namespace operators -} // namespace paddle +void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { + id_arr.push_back(id); + weight_arr.push_back(weight); +} +} +} diff --git a/paddle/fluid/distributed/table/graph_edge.h b/paddle/fluid/distributed/table/graph_edge.h new file mode 100644 index 00000000000000..3dfe5a6f357a7c --- /dev/null +++ b/paddle/fluid/distributed/table/graph_edge.h @@ -0,0 +1,46 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +namespace paddle { +namespace distributed { + +class GraphEdgeBlob { + public: + GraphEdgeBlob() {} + virtual ~GraphEdgeBlob() {} + size_t size() { return id_arr.size(); } + virtual void add_edge(uint64_t id, float weight); + uint64_t get_id(int idx) { return id_arr[idx]; } + virtual float get_weight(int idx) { return 1; } + + protected: + std::vector id_arr; +}; + +class WeightedGraphEdgeBlob : public GraphEdgeBlob { + public: + WeightedGraphEdgeBlob() {} + virtual ~WeightedGraphEdgeBlob() {} + virtual void add_edge(uint64_t id, float weight); + virtual float get_weight(int idx) { return weight_arr[idx]; } + + protected: + std::vector weight_arr; +}; +} +} diff --git a/paddle/fluid/distributed/table/graph_node.cc b/paddle/fluid/distributed/table/graph_node.cc new file mode 100644 index 00000000000000..27a2cafaf4f0fe --- /dev/null +++ b/paddle/fluid/distributed/table/graph_node.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/distributed/table/graph_node.h" +#include +namespace paddle { +namespace distributed { + +GraphNode::~GraphNode() { + if (sampler != nullptr) { + delete sampler; + sampler = nullptr; + } + if (edges != nullptr) { + delete edges; + edges = nullptr; + } +} + +int Node::weight_size = sizeof(float); +int Node::id_size = sizeof(uint64_t); +int Node::int_size = sizeof(int); + +int Node::get_size(bool need_feature) { return id_size + int_size; } + +void Node::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + memcpy(buffer, &feat_num, sizeof(int)); +} + +void Node::recover_from_buffer(char* buffer) { memcpy(&id, buffer, id_size); } + +int FeatureNode::get_size(bool need_feature) { + int size = id_size + int_size; // id, feat_num + if (need_feature) { + size += feature.size() * int_size; + for (const std::string& fea : feature) { + size += fea.size(); + } + } + return size; +} + +void GraphNode::build_edges(bool is_weighted) { + if (edges == nullptr) { + if (is_weighted == true) { + edges = new WeightedGraphEdgeBlob(); + } else { + edges = new GraphEdgeBlob(); + } + } +} +void GraphNode::build_sampler(std::string sample_type) { + if (sample_type == "random") { + sampler = new RandomSampler(); + } else if (sample_type == "weighted") { + sampler = new WeightedSampler(); + } + sampler->build(edges); +} +void FeatureNode::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + int feat_len; + if (need_feature) { + feat_num += feature.size(); + memcpy(buffer, &feat_num, sizeof(int)); + buffer += sizeof(int); + for (int i = 0; i < feat_num; ++i) { + feat_len = feature[i].size(); + memcpy(buffer, &feat_len, sizeof(int)); + buffer += sizeof(int); + memcpy(buffer, feature[i].c_str(), feature[i].size()); + buffer += feature[i].size(); + } + } else { + memcpy(buffer, &feat_num, sizeof(int)); + } +} +void FeatureNode::recover_from_buffer(char* buffer) { + int feat_num, feat_len; + memcpy(&id, buffer, id_size); + buffer += id_size; + + memcpy(&feat_num, buffer, sizeof(int)); + buffer += sizeof(int); + + feature.clear(); + for (int i = 0; i < feat_num; ++i) { + memcpy(&feat_len, buffer, sizeof(int)); + buffer += sizeof(int); + + char str[feat_len + 1]; + memcpy(str, buffer, feat_len); + buffer += feat_len; + str[feat_len] = '\0'; + feature.push_back(std::string(str)); + } +} +} +} diff --git a/paddle/fluid/distributed/table/graph_node.h b/paddle/fluid/distributed/table/graph_node.h new file mode 100644 index 00000000000000..c3e8e3ce5b50d0 --- /dev/null +++ b/paddle/fluid/distributed/table/graph_node.h @@ -0,0 +1,127 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/distributed/table/graph_weighted_sampler.h" +namespace paddle { +namespace distributed { + +class Node { + public: + Node() {} + Node(uint64_t id) : id(id) {} + virtual ~Node() {} + static int id_size, int_size, weight_size; + uint64_t get_id() { return id; } + void set_id(uint64_t id) { this->id = id; } + + virtual void build_edges(bool is_weighted) {} + virtual void build_sampler(std::string sample_type) {} + virtual void add_edge(uint64_t id, float weight) {} + virtual std::vector sample_k(int k) { return std::vector(); } + virtual uint64_t get_neighbor_id(int idx) { return 0; } + virtual float get_neighbor_weight(int idx) { return 1.; } + + virtual int get_size(bool need_feature); + virtual void to_buffer(char *buffer, bool need_feature); + virtual void recover_from_buffer(char *buffer); + virtual std::string get_feature(int idx) { return std::string(""); } + virtual void set_feature(int idx, std::string str) {} + virtual void set_feature_size(int size) {} + virtual int get_feature_size() { return 0; } + + protected: + uint64_t id; +}; + +class GraphNode : public Node { + public: + GraphNode() : Node(), sampler(nullptr), edges(nullptr) {} + GraphNode(uint64_t id) : Node(id), sampler(nullptr), edges(nullptr) {} + virtual ~GraphNode(); + virtual void build_edges(bool is_weighted); + virtual void build_sampler(std::string sample_type); + virtual void add_edge(uint64_t id, float weight) { + edges->add_edge(id, weight); + } + virtual std::vector sample_k(int k) { return sampler->sample_k(k); } + virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); } + virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); } + + protected: + Sampler *sampler; + GraphEdgeBlob *edges; +}; + +class FeatureNode : public Node { + public: + FeatureNode() : Node() {} + FeatureNode(uint64_t id) : Node(id) {} + virtual ~FeatureNode() {} + virtual int get_size(bool need_feature); + virtual void to_buffer(char *buffer, bool need_feature); + virtual void recover_from_buffer(char *buffer); + virtual std::string get_feature(int idx) { + if (idx < (int)this->feature.size()) { + return this->feature[idx]; + } else { + return std::string(""); + } + } + + virtual void set_feature(int idx, std::string str) { + if (idx >= (int)this->feature.size()) { + this->feature.resize(idx + 1); + } + this->feature[idx] = str; + } + virtual void set_feature_size(int size) { this->feature.resize(size); } + virtual int get_feature_size() { return this->feature.size(); } + + template + static std::string parse_value_to_bytes(std::vector feat_str) { + T v; + size_t Tsize = sizeof(T) * feat_str.size(); + char buffer[Tsize]; + for (size_t i = 0; i < feat_str.size(); i++) { + std::stringstream ss(feat_str[i]); + ss >> v; + std::memcpy(buffer + sizeof(T) * i, (char *)&v, sizeof(T)); + } + return std::string(buffer, Tsize); + } + + template + static std::vector parse_bytes_to_array(std::string feat_str) { + T v; + std::vector out; + size_t start = 0; + const char *buffer = feat_str.data(); + while (start < feat_str.size()) { + std::memcpy((char *)&v, buffer + start, sizeof(T)); + start += sizeof(T); + out.push_back(v); + } + return out; + } + + protected: + std::vector feature; +}; +} +} diff --git a/paddle/fluid/distributed/table/graph_weighted_sampler.cc b/paddle/fluid/distributed/table/graph_weighted_sampler.cc new file mode 100644 index 00000000000000..059a1d64bc392d --- /dev/null +++ 
b/paddle/fluid/distributed/table/graph_weighted_sampler.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/table/graph_weighted_sampler.h" +#include +#include +namespace paddle { +namespace distributed { + +void RandomSampler::build(GraphEdgeBlob *edges) { this->edges = edges; } + +std::vector RandomSampler::sample_k(int k) { + int n = edges->size(); + if (k > n) { + k = n; + } + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + std::vector sample_result; + std::unordered_map replace_map; + while (k--) { + int rand_int = rand() % n; + auto iter = replace_map.find(rand_int); + if (iter == replace_map.end()) { + sample_result.push_back(rand_int); + } else { + sample_result.push_back(iter->second); + } + + iter = replace_map.find(n - 1); + if (iter == replace_map.end()) { + replace_map[rand_int] = n - 1; + } else { + replace_map[rand_int] = iter->second; + } + --n; + } + return sample_result; +} + +WeightedSampler::WeightedSampler() { + left = nullptr; + right = nullptr; + edges = nullptr; +} + +WeightedSampler::~WeightedSampler() { + if (left != nullptr) { + delete left; + left = nullptr; + } + if (right != nullptr) { + delete right; + right = nullptr; + } +} + +void WeightedSampler::build(GraphEdgeBlob *edges) { + if (left != nullptr) { + delete left; + left = nullptr; + } + if (right != nullptr) { + delete right; + right = nullptr; + } + return build_one((WeightedGraphEdgeBlob *)edges, 0, edges->size()); +} + +void WeightedSampler::build_one(WeightedGraphEdgeBlob *edges, int start, + int end) { + count = 0; + this->edges = edges; + if (start + 1 == end) { + left = right = nullptr; + idx = start; + count = 1; + weight = edges->get_weight(idx); + + } else { + left = new WeightedSampler(); + right = new WeightedSampler(); + left->build_one(edges, start, start + (end - start) / 2); + right->build_one(edges, start + (end - start) / 2, end); + weight = left->weight + right->weight; + count = left->count + right->count; + } +} +std::vector WeightedSampler::sample_k(int k) { + if (k > count) { + k = count; + } + std::vector sample_result; + float subtract; + std::unordered_map subtract_weight_map; + std::unordered_map subtract_count_map; + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + while (k--) { + float query_weight = rand() % 100000 / 100000.0; + query_weight *= weight - subtract_weight_map[this]; + sample_result.push_back(sample(query_weight, subtract_weight_map, + subtract_count_map, subtract)); + } + return sample_result; +} + +int WeightedSampler::sample( + float query_weight, + std::unordered_map &subtract_weight_map, + std::unordered_map &subtract_count_map, + float &subtract) { + if (left == nullptr) { + subtract_weight_map[this] = weight; + subtract = weight; + subtract_count_map[this] = 1; + return idx; + } + int left_count = left->count - subtract_count_map[left]; + int right_count = 
right->count - subtract_count_map[right]; + float left_subtract = subtract_weight_map[left]; + int return_idx; + if (right_count == 0 || + left_count > 0 && left->weight - left_subtract >= query_weight) { + return_idx = left->sample(query_weight, subtract_weight_map, + subtract_count_map, subtract); + } else { + return_idx = + right->sample(query_weight - (left->weight - left_subtract), + subtract_weight_map, subtract_count_map, subtract); + } + subtract_weight_map[this] += subtract; + subtract_count_map[this]++; + return return_idx; +} +} +} diff --git a/paddle/fluid/distributed/table/graph_weighted_sampler.h b/paddle/fluid/distributed/table/graph_weighted_sampler.h new file mode 100644 index 00000000000000..cfc341d27c6b76 --- /dev/null +++ b/paddle/fluid/distributed/table/graph_weighted_sampler.h @@ -0,0 +1,58 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/distributed/table/graph_edge.h" +namespace paddle { +namespace distributed { + +class Sampler { + public: + virtual ~Sampler() {} + virtual void build(GraphEdgeBlob *edges) = 0; + virtual std::vector sample_k(int k) = 0; +}; + +class RandomSampler : public Sampler { + public: + virtual ~RandomSampler() {} + virtual void build(GraphEdgeBlob *edges); + virtual std::vector sample_k(int k); + GraphEdgeBlob *edges; +}; + +class WeightedSampler : public Sampler { + public: + WeightedSampler(); + virtual ~WeightedSampler(); + WeightedSampler *left, *right; + float weight; + int count; + int idx; + GraphEdgeBlob *edges; + virtual void build(GraphEdgeBlob *edges); + virtual void build_one(WeightedGraphEdgeBlob *edges, int start, int end); + virtual std::vector sample_k(int k); + + private: + int sample(float query_weight, + std::unordered_map &subtract_weight_map, + std::unordered_map &subtract_count_map, + float &subtract); +}; +} +} diff --git a/paddle/fluid/distributed/table/sparse_geo_table.cc b/paddle/fluid/distributed/table/sparse_geo_table.cc index 9b276e7de5c92d..04cd1136382a4e 100644 --- a/paddle/fluid/distributed/table/sparse_geo_table.cc +++ b/paddle/fluid/distributed/table/sparse_geo_table.cc @@ -22,8 +22,17 @@ int32_t SparseGeoTable::pull_geo_param(const uint32_t trainer_id, std::vector* ids) { geo_recorder->GetAndClear(trainer_id, ids); auto dim = _config.common().dims()[0]; + + std::vector frequencies; + frequencies.resize(ids->size(), 1); + + auto pull_value = PullSparseValue(ids->size(), dim); + pull_value.is_training_ = true; + pull_value.feasigns_ = ids->data(); + pull_value.frequencies_ = frequencies.data(); + values->resize(ids->size() * dim); - CommonSparseTable::pull_sparse(values->data(), ids->data(), ids->size()); + CommonSparseTable::pull_sparse(values->data(), pull_value); return 0; } diff --git a/paddle/fluid/distributed/table/table.cc b/paddle/fluid/distributed/table/table.cc index dfaaa6ffc12c2b..600be954cb5966 100644 --- a/paddle/fluid/distributed/table/table.cc +++ 
b/paddle/fluid/distributed/table/table.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/table/common_dense_table.h" +#include "paddle/fluid/distributed/table/common_graph_table.h" #include "paddle/fluid/distributed/table/common_sparse_table.h" #include "paddle/fluid/distributed/table/sparse_geo_table.h" #include "paddle/fluid/distributed/table/tensor_accessor.h" @@ -25,7 +26,7 @@ namespace paddle { namespace distributed { - +REGISTER_PSCORE_CLASS(Table, GraphTable); REGISTER_PSCORE_CLASS(Table, CommonDenseTable); REGISTER_PSCORE_CLASS(Table, CommonSparseTable); REGISTER_PSCORE_CLASS(Table, SparseGeoTable); @@ -75,5 +76,6 @@ int32_t Table::initialize_accessor() { _value_accesor.reset(accessor); return 0; } + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/table/table.h b/paddle/fluid/distributed/table/table.h index 65c99d2bbd40d4..81a1ff5eced2bb 100644 --- a/paddle/fluid/distributed/table/table.h +++ b/paddle/fluid/distributed/table/table.h @@ -21,6 +21,8 @@ #include #include #include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/depends/sparse_utils.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/device_context.h" @@ -46,10 +48,17 @@ class Table { return 0; } - virtual int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) = 0; + virtual int32_t pull_sparse_ptr(char **pull_values, const uint64_t *keys, + size_t num) { + VLOG(0) << "NOT IMPLEMENT"; + return 0; + } + virtual int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) = 0; virtual int32_t push_sparse(const uint64_t *keys, const float *values, size_t num) = 0; + virtual int32_t push_sparse(const uint64_t *keys, const float **values, + size_t num){}; virtual int32_t push_sparse_param(const uint64_t *keys, const float *values, size_t num) { return 0; @@ -141,5 +150,6 @@ class TableManager { TableManager() {} ~TableManager() {} }; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/table/tensor_table.h b/paddle/fluid/distributed/table/tensor_table.h index 1a8f1a9cd9adb8..080682d131420b 100644 --- a/paddle/fluid/distributed/table/tensor_table.h +++ b/paddle/fluid/distributed/table/tensor_table.h @@ -52,8 +52,8 @@ class TensorTable : public Table { int32_t push_dense(const float *values, size_t num) override { return 0; } - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, @@ -102,8 +102,8 @@ class DenseTensorTable : public TensorTable { DenseTensorTable() {} virtual ~DenseTensorTable() {} - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, @@ -158,8 +158,8 @@ class GlobalStepTable : public DenseTensorTable { GlobalStepTable() {} virtual ~GlobalStepTable() {} - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, diff --git 
a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index adedd049023daa..b756c740ac764c 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -15,3 +15,6 @@ cc_test(brpc_service_sparse_sgd_test SRCS brpc_service_sparse_sgd_test.cc DEPS s set_source_files_properties(brpc_utils_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(brpc_utils_test SRCS brpc_utils_test.cc DEPS brpc_utils scope math_function ${COMMON_DEPS} ${RPC_DEPS}) + +set_source_files_properties(graph_node_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) diff --git a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc index fbd236012f5237..8fb3434af6e281 100644 --- a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc +++ b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc @@ -212,8 +212,8 @@ void RunBrpcPushSparse() { /*-----------------------Test Server Init----------------------------------*/ LOG(INFO) << "Run pull_sparse_param"; - auto pull_status = worker_ptr_->pull_sparse(fea_value_ptr.data(), 0, - fea_keys.data(), fea_keys.size()); + auto pull_status = worker_ptr_->pull_sparse( + fea_value_ptr.data(), 0, fea_keys.data(), fea_keys.size(), true); pull_status.wait(); for (size_t idx = 0; idx < tensor->numel(); ++idx) { fea_values.data()[idx] *= 2.0; @@ -241,7 +241,7 @@ void RunBrpcPushSparse() { push_status.wait(); auto pull_param_status = worker_ptr_->pull_sparse( - fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size()); + fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size(), true); pull_param_status.wait(); for (size_t idx = 0; idx < tensor->numel(); ++idx) { @@ -275,7 +275,7 @@ void RunBrpcPushSparse() { push_grad_status.wait(); auto pull_update_status = worker_ptr_->pull_sparse( - fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size()); + fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size(), true); pull_update_status.wait(); for (size_t idx = 0; idx < tensor->numel(); ++idx) { diff --git a/paddle/fluid/distributed/test/geo_table_test.cc b/paddle/fluid/distributed/test/geo_table_test.cc index 22e11acf6584ee..c9f15db3f788e1 100644 --- a/paddle/fluid/distributed/test/geo_table_test.cc +++ b/paddle/fluid/distributed/test/geo_table_test.cc @@ -23,6 +23,7 @@ limitations under the License. 
*/ #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/table/common_dense_table.h" #include "paddle/fluid/distributed/table/common_sparse_table.h" +#include "paddle/fluid/distributed/table/depends/sparse_utils.h" #include "paddle/fluid/distributed/table/sparse_geo_table.h" #include "paddle/fluid/distributed/table/table.h" @@ -53,14 +54,18 @@ TEST(SparseGeoTable, SSUM) { // test push_sparse_param, and create params std::vector init_keys = {0, 1, 2, 3, 4}; + std::vector init_fres = {1, 1, 1, 1, 1}; std::vector init_values; for (size_t i = 0; i < init_keys.size() * emb_dim; i++) { init_values.push_back(0.0); } table->push_sparse_param(init_keys.data(), init_values.data(), init_keys.size()); + std::vector pull_values(init_values.size()); - table->pull_sparse(pull_values.data(), init_keys.data(), init_keys.size()); + auto value = PullSparseValue(init_keys, init_fres, emb_dim); + table->pull_sparse(pull_values.data(), value); + for (size_t i = 0; i < init_keys.size() * emb_dim; i++) { ASSERT_TRUE(abs(pull_values[i] - init_values[i]) < 1e-5); } diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc new file mode 100644 index 00000000000000..b268bb449e1461 --- /dev/null +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -0,0 +1,556 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include // NOLINT +#include +#include +#include +#include // NOLINT +#include +#include +#include "google/protobuf/text_format.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/service/env.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/graph_py_service.h" +#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/service/service.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace math = paddle::operators::math; +namespace memory = paddle::memory; +namespace distributed = paddle::distributed; + +void testSampleNodes( + std::shared_ptr& worker_ptr_) { + std::vector ids; + auto pull_status = worker_ptr_->random_sample_nodes(0, 0, 6, ids); + std::unordered_set s; + std::unordered_set s1 = {37, 59}; + pull_status.wait(); + for (auto id : ids) s.insert(id); + ASSERT_EQ(true, s.size() == s1.size()); + for (auto id : s) { + ASSERT_EQ(true, s1.find(id) != s1.end()); + } +} + +void testFeatureNodeSerializeInt() { + std::string out = + distributed::FeatureNode::parse_value_to_bytes({"123", "345"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + ASSERT_EQ(out2[0], 123); + ASSERT_EQ(out2[1], 345); +} + +void testFeatureNodeSerializeInt64() { + std::string out = + distributed::FeatureNode::parse_value_to_bytes({"123", "345"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + ASSERT_EQ(out2[0], 123); + ASSERT_EQ(out2[1], 345); +} + +void testFeatureNodeSerializeFloat32() { + std::string out = distributed::FeatureNode::parse_value_to_bytes( + {"123.123", "345.123"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + float eps; + std::cout << "Float " << out2[0] << " " << 123.123 << std::endl; + eps = out2[0] - 123.123; + ASSERT_LE(eps * eps, 1e-5); + eps = out2[1] - 345.123; + ASSERT_LE(eps * eps, 1e-5); +} + +void testFeatureNodeSerializeFloat64() { + std::string out = distributed::FeatureNode::parse_value_to_bytes( + {"123.123", "345.123"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + float eps; + eps = out2[0] - 123.123; + std::cout << "Float64 " << out2[0] << " " << 123.123 << std::endl; + ASSERT_LE(eps * eps, 1e-5); + eps = out2[1] - 345.123; + ASSERT_LE(eps * eps, 1e-5); +} + +void testSingleSampleNeighboor( + std::shared_ptr& worker_ptr_) { + std::vector>> vs; + auto pull_status = worker_ptr_->batch_sample_neighboors( + 0, std::vector(1, 37), 4, vs); + pull_status.wait(); + + std::unordered_set s; + std::unordered_set s1 = {112, 45, 145}; + for (auto g : vs[0]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + 
ASSERT_EQ(true, s1.find(g) != s1.end()); + } + VLOG(0) << "test single done"; + s.clear(); + s1.clear(); + vs.clear(); + pull_status = worker_ptr_->batch_sample_neighboors( + 0, std::vector(1, 96), 4, vs); + pull_status.wait(); + s1 = {111, 48, 247}; + for (auto g : vs[0]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + ASSERT_EQ(true, s1.find(g) != s1.end()); + } +} + +void testBatchSampleNeighboor( + std::shared_ptr& worker_ptr_) { + std::vector>> vs; + std::vector v = {37, 96}; + auto pull_status = worker_ptr_->batch_sample_neighboors(0, v, 4, vs); + pull_status.wait(); + std::unordered_set s; + std::unordered_set s1 = {112, 45, 145}; + for (auto g : vs[0]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + ASSERT_EQ(true, s1.find(g) != s1.end()); + } + s.clear(); + s1.clear(); + s1 = {111, 48, 247}; + for (auto g : vs[1]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + ASSERT_EQ(true, s1.find(g) != s1.end()); + } +} + +void testGraphToBuffer(); +// std::string nodes[] = {std::string("37\taa\t45;0.34\t145;0.31\t112;0.21"), +// std::string("96\tfeature\t48;1.4\t247;0.31\t111;1.21"), +// std::string("59\ttreat\t45;0.34\t145;0.31\t112;0.21"), +// std::string("97\tfood\t48;1.4\t247;0.31\t111;1.21")}; + +std::string edges[] = { + std::string("37\t45\t0.34"), std::string("37\t145\t0.31"), + std::string("37\t112\t0.21"), std::string("96\t48\t1.4"), + std::string("96\t247\t0.31"), std::string("96\t111\t1.21"), + std::string("59\t45\t0.34"), std::string("59\t145\t0.31"), + std::string("59\t122\t0.21"), std::string("97\t48\t0.34"), + std::string("97\t247\t0.31"), std::string("97\t111\t0.21")}; +char edge_file_name[] = "edges.txt"; + +std::string nodes[] = { + std::string("user\t37\ta 0.34\tb 13 14\tc hello\td abc"), + std::string("user\t96\ta 0.31\tb 15 10\tc 96hello\td abcd"), + std::string("user\t59\ta 0.11\tb 11 14"), + std::string("user\t97\ta 0.11\tb 12 11"), + std::string("item\t45\ta 0.21"), + std::string("item\t145\ta 0.21"), + std::string("item\t112\ta 0.21"), + std::string("item\t48\ta 0.21"), + std::string("item\t247\ta 0.21"), + std::string("item\t111\ta 0.21"), + std::string("item\t46\ta 0.21"), + std::string("item\t146\ta 0.21"), + std::string("item\t122\ta 0.21"), + std::string("item\t49\ta 0.21"), + std::string("item\t248\ta 0.21"), + std::string("item\t113\ta 0.21")}; +char node_file_name[] = "nodes.txt"; + +void prepare_file(char file_name[], bool load_edge) { + std::ofstream ofile; + ofile.open(file_name); + if (load_edge) { + for (auto x : edges) { + ofile << x << std::endl; + } + } else { + for (auto x : nodes) { + ofile << x << std::endl; + } + } + ofile.close(); +} +void GetDownpourSparseTableProto( + ::paddle::distributed::TableParameter* sparse_table_proto) { + sparse_table_proto->set_table_id(0); + sparse_table_proto->set_table_class("GraphTable"); + sparse_table_proto->set_shard_num(127); + sparse_table_proto->set_type(::paddle::distributed::PS_SPARSE_TABLE); + ::paddle::distributed::TableAccessorParameter* accessor_proto = + sparse_table_proto->mutable_accessor(); + accessor_proto->set_accessor_class("CommMergeAccessor"); +} + +::paddle::distributed::PSParameter GetServerProto() { + // Generate server proto desc + ::paddle::distributed::PSParameter server_fleet_desc; + ::paddle::distributed::ServerParameter* server_proto = + server_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + 
::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + ::paddle::distributed::TableParameter* sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(sparse_table_proto); + return server_fleet_desc; +} + +::paddle::distributed::PSParameter GetWorkerProto() { + ::paddle::distributed::PSParameter worker_fleet_desc; + ::paddle::distributed::WorkerParameter* worker_proto = + worker_fleet_desc.mutable_worker_param(); + + ::paddle::distributed::DownpourWorkerParameter* downpour_worker_proto = + worker_proto->mutable_downpour_worker_param(); + + ::paddle::distributed::TableParameter* worker_sparse_table_proto = + downpour_worker_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(worker_sparse_table_proto); + + ::paddle::distributed::ServerParameter* server_proto = + worker_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + ::paddle::distributed::TableParameter* server_sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(server_sparse_table_proto); + + return worker_fleet_desc; +} + +/*-------------------------------------------------------------------------*/ + +std::string ip_ = "127.0.0.1", ip2 = "127.0.0.1"; +uint32_t port_ = 5209, port2 = 5210; + +std::vector host_sign_list_; + +std::shared_ptr pserver_ptr_, + pserver_ptr2; + +std::shared_ptr worker_ptr_; + +void RunServer() { + LOG(INFO) << "init first server"; + ::paddle::distributed::PSParameter server_proto = GetServerProto(); + + auto _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list_, 2); // test + pserver_ptr_ = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto)); + std::vector empty_vec; + framework::ProgramDesc empty_prog; + empty_vec.push_back(empty_prog); + pserver_ptr_->configure(server_proto, _ps_env, 0, empty_vec); + LOG(INFO) << "first server, run start(ip,port)"; + pserver_ptr_->start(ip_, port_); + LOG(INFO) << "init first server Done"; +} + +void RunServer2() { + LOG(INFO) << "init second server"; + ::paddle::distributed::PSParameter server_proto2 = GetServerProto(); + + auto _ps_env2 = paddle::distributed::PaddlePSEnvironment(); + _ps_env2.set_ps_servers(&host_sign_list_, 2); // test + pserver_ptr2 = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto2)); + std::vector empty_vec2; + framework::ProgramDesc empty_prog2; + empty_vec2.push_back(empty_prog2); + pserver_ptr2->configure(server_proto2, _ps_env2, 1, empty_vec2); + pserver_ptr2->start(ip2, port2); +} + +void RunClient( + 
std::map>& dense_regions,
+ int index, paddle::distributed::PsBaseService* service) {
+ ::paddle::distributed::PSParameter worker_proto = GetWorkerProto();
+ paddle::distributed::PaddlePSEnvironment _ps_env;
+ auto servers_ = host_sign_list_.size();
+ _ps_env = paddle::distributed::PaddlePSEnvironment();
+ _ps_env.set_ps_servers(&host_sign_list_, servers_);
+ worker_ptr_ = std::shared_ptr(
+ (paddle::distributed::GraphBrpcClient*)
+ paddle::distributed::PSClientFactory::create(worker_proto));
+ worker_ptr_->configure(worker_proto, dense_regions, _ps_env, 0);
+ worker_ptr_->set_shard_num(127);
+ worker_ptr_->set_local_channel(index);
+ worker_ptr_->set_local_graph_service(
+ (paddle::distributed::GraphBrpcService*)service);
+}
+
+void RunBrpcPushSparse() {
+ setenv("http_proxy", "", 1);
+ setenv("https_proxy", "", 1);
+ prepare_file(edge_file_name, 1);
+ prepare_file(node_file_name, 0);
+ auto ph_host = paddle::distributed::PSHost(ip_, port_, 0);
+ host_sign_list_.push_back(ph_host.serialize_to_string());
+
+ // test-start
+ auto ph_host2 = paddle::distributed::PSHost(ip2, port2, 1);
+ host_sign_list_.push_back(ph_host2.serialize_to_string());
+ // test-end
+ // Start Server
+ std::thread* server_thread = new std::thread(RunServer);
+ std::thread* server_thread2 = new std::thread(RunServer2);
+ sleep(1);
+
+ std::map> dense_regions;
+ dense_regions.insert(
+ std::pair>(0, {}));
+ auto regions = dense_regions[0];
+
+ RunClient(dense_regions, 0, pserver_ptr_->get_service());
+
+ /*-----------------------Test Server Init----------------------------------*/
+ auto pull_status =
+ worker_ptr_->load(0, std::string(edge_file_name), std::string("e>"));
+ srand(time(0));
+ pull_status.wait();
+ std::vector>> vs;
+ testSampleNodes(worker_ptr_);
+ sleep(5);
+ testSingleSampleNeighboor(worker_ptr_);
+ testBatchSampleNeighboor(worker_ptr_);
+ pull_status = worker_ptr_->batch_sample_neighboors(
+ 0, std::vector(1, 10240001024), 4, vs);
+ pull_status.wait();
+ ASSERT_EQ(0, vs[0].size());
+
+ std::vector nodes;
+ pull_status = worker_ptr_->pull_graph_list(0, 0, 0, 1, 1, nodes);
+ pull_status.wait();
+ ASSERT_EQ(nodes.size(), 1);
+ ASSERT_EQ(nodes[0].get_id(), 37);
+ nodes.clear();
+ pull_status = worker_ptr_->pull_graph_list(0, 0, 1, 4, 1, nodes);
+ pull_status.wait();
+ ASSERT_EQ(nodes.size(), 1);
+ ASSERT_EQ(nodes[0].get_id(), 59);
+ for (auto g : nodes) {
+ std::cout << g.get_id() << std::endl;
+ }
+ distributed::GraphPyServer server1, server2;
+ distributed::GraphPyClient client1, client2;
+ std::string ips_str = "127.0.0.1:5211;127.0.0.1:5212";
+ std::vector edge_types = {std::string("user2item")};
+ std::vector node_types = {std::string("user"),
+ std::string("item")};
+ VLOG(0) << "make 2 servers";
+ server1.set_up(ips_str, 127, node_types, edge_types, 0);
+ server2.set_up(ips_str, 127, node_types, edge_types, 1);
+
+ server1.add_table_feat_conf("user", "a", "float32", 1);
+ server1.add_table_feat_conf("user", "b", "int32", 2);
+ server1.add_table_feat_conf("user", "c", "string", 1);
+ server1.add_table_feat_conf("user", "d", "string", 1);
+ server1.add_table_feat_conf("item", "a", "float32", 1);
+
+ server2.add_table_feat_conf("user", "a", "float32", 1);
+ server2.add_table_feat_conf("user", "b", "int32", 2);
+ server2.add_table_feat_conf("user", "c", "string", 1);
+ server2.add_table_feat_conf("user", "d", "string", 1);
+ server2.add_table_feat_conf("item", "a", "float32", 1);
+
+ client1.set_up(ips_str, 127, node_types, edge_types, 0);
+
+ client1.add_table_feat_conf("user", "a", "float32", 1);
+ client1.add_table_feat_conf("user", "b", "int32", 2);
+ client1.add_table_feat_conf("user", "c", "string", 1);
+ client1.add_table_feat_conf("user", "d", "string", 1);
+ client1.add_table_feat_conf("item", "a", "float32", 1);
+
+ client2.set_up(ips_str, 127, node_types, edge_types, 1);
+
+ client2.add_table_feat_conf("user", "a", "float32", 1);
+ client2.add_table_feat_conf("user", "b", "int32", 2);
+ client2.add_table_feat_conf("user", "c", "string", 1);
+ client2.add_table_feat_conf("user", "d", "string", 1);
+ client2.add_table_feat_conf("item", "a", "float32", 1);
+
+ server1.start_server(false);
+ std::cout << "first server done" << std::endl;
+ server2.start_server(false);
+ std::cout << "second server done" << std::endl;
+ client1.start_client();
+ std::cout << "first client done" << std::endl;
+ client2.start_client();
+ std::cout << "second client done" << std::endl;
+ std::cout << "started" << std::endl;
+ VLOG(0) << "come to set local server";
+ client1.bind_local_server(0, server1);
+ VLOG(0) << "first bound";
+ client2.bind_local_server(1, server2);
+ VLOG(0) << "second bound";
+ client1.load_node_file(std::string("user"), std::string(node_file_name));
+ client1.load_node_file(std::string("item"), std::string(node_file_name));
+ client1.load_edge_file(std::string("user2item"), std::string(edge_file_name),
+ 0);
+ nodes.clear();
+
+ nodes = client1.pull_graph_list(std::string("user"), 0, 1, 4, 1);
+
+ ASSERT_EQ(nodes[0].get_id(), 59);
+ nodes.clear();
+
+ // Test Pull by step
+
+ std::unordered_set count_item_nodes;
+ // pull by step 1, 2, 3
+ for (int test_step = 1; test_step < 4; test_step++) {
+ count_item_nodes.clear();
+ std::cout << "check pull graph list by step " << test_step << std::endl;
+ for (int server_id = 0; server_id < 2; server_id++) {
+ for (int start_step = 0; start_step < test_step; start_step++) {
+ nodes = client1.pull_graph_list(std::string("item"), server_id,
+ start_step, 12, test_step);
+ for (auto g : nodes) {
+ count_item_nodes.insert(g.get_id());
+ }
+ nodes.clear();
+ }
+ }
+ ASSERT_EQ(count_item_nodes.size(), 12);
+ }
+
+ vs = client1.batch_sample_neighboors(std::string("user2item"),
+ std::vector(1, 96), 4);
+ ASSERT_EQ(vs[0].size(), 3);
+ std::vector node_ids;
+ node_ids.push_back(96);
+ node_ids.push_back(37);
+ vs = client1.batch_sample_neighboors(std::string("user2item"), node_ids, 4);
+
+ ASSERT_EQ(vs.size(), 2);
+ std::vector nodes_ids = client2.random_sample_nodes("user", 0, 6);
+ ASSERT_EQ(nodes_ids.size(), 2);
+ ASSERT_EQ(true, (nodes_ids[0] == 59 && nodes_ids[1] == 37) ||
+ (nodes_ids[0] == 37 && nodes_ids[1] == 59));
+
+ // Test get node feat
+ node_ids.clear();
+ node_ids.push_back(37);
+ node_ids.push_back(96);
+ std::vector feature_names;
+ feature_names.push_back(std::string("c"));
+ feature_names.push_back(std::string("d"));
+ auto node_feat =
+ client1.get_node_feat(std::string("user"), node_ids, feature_names);
+ ASSERT_EQ(node_feat.size(), 2);
+ ASSERT_EQ(node_feat[0].size(), 2);
+ VLOG(0) << "get_node_feat: " << node_feat[0][0];
+ VLOG(0) << "get_node_feat: " << node_feat[0][1];
+ VLOG(0) << "get_node_feat: " << node_feat[1][0];
+ VLOG(0) << "get_node_feat: " << node_feat[1][1];
+
+ // Test string
+ node_ids.clear();
+ node_ids.push_back(37);
+ node_ids.push_back(96);
+ // std::vector feature_names;
+ feature_names.clear();
+ feature_names.push_back(std::string("a"));
+ feature_names.push_back(std::string("b"));
+ node_feat =
+ client1.get_node_feat(std::string("user"), node_ids, feature_names);
+ ASSERT_EQ(node_feat.size(), 2);
+
ASSERT_EQ(node_feat[0].size(), 2); + VLOG(0) << "get_node_feat: " << node_feat[0][0].size(); + VLOG(0) << "get_node_feat: " << node_feat[0][1].size(); + VLOG(0) << "get_node_feat: " << node_feat[1][0].size(); + VLOG(0) << "get_node_feat: " << node_feat[1][1].size(); + + std::remove(edge_file_name); + std::remove(node_file_name); + LOG(INFO) << "Run stop_server"; + worker_ptr_->stop_server(); + LOG(INFO) << "Run finalize_worker"; + worker_ptr_->finalize_worker(); + testFeatureNodeSerializeInt(); + testFeatureNodeSerializeInt64(); + testFeatureNodeSerializeFloat32(); + testFeatureNodeSerializeFloat64(); + testGraphToBuffer(); + client1.stop_server(); +} + +void testGraphToBuffer() { + ::paddle::distributed::GraphNode s, s1; + s.set_feature_size(1); + s.set_feature(0, std::string("hhhh")); + s.set_id(65); + int size = s.get_size(true); + char str[size]; + s.to_buffer(str, true); + s1.recover_from_buffer(str); + ASSERT_EQ(s.get_id(), s1.get_id()); + VLOG(0) << s.get_feature(0); + VLOG(0) << s1.get_feature(0); +} + +TEST(RunBrpcPushSparse, Run) { RunBrpcPushSparse(); } diff --git a/paddle/fluid/distributed/test/sparse_table_test.cc b/paddle/fluid/distributed/test/sparse_table_test.cc index 6db95c5fac211b..26bede392d6fad 100644 --- a/paddle/fluid/distributed/test/sparse_table_test.cc +++ b/paddle/fluid/distributed/test/sparse_table_test.cc @@ -55,9 +55,14 @@ TEST(CommonSparseTable, SGD) { // pull parameters for create and check std::vector init_keys = {0, 1, 2, 3, 4}; + std::vector init_fres = {1, 1, 1, 1, 1}; + std::vector init_values; init_values.resize(init_keys.size() * emb_dim); - table->pull_sparse(init_values.data(), init_keys.data(), init_keys.size()); + + std::vector pull_values(init_values.size()); + auto value = PullSparseValue(init_keys, init_fres, emb_dim); + table->pull_sparse(init_values.data(), value); // for check std::vector total_gradients; @@ -100,7 +105,8 @@ TEST(CommonSparseTable, SGD) { std::vector pull_values; pull_values.resize(init_keys.size() * emb_dim); - table->pull_sparse(pull_values.data(), init_keys.data(), init_keys.size()); + table->pull_sparse(init_values.data(), value); + for (size_t i = 0; i < init_values.size(); ++i) { auto update_val = init_values[i] - 1.0 * total_gradients[i]; ASSERT_TRUE(abs(update_val - pull_values[i]) < 1e-5); @@ -148,9 +154,13 @@ TEST(CommonSparseTable, Adam) { // pull parameters for create and check std::vector init_keys = {0, 1, 2, 3, 4}; + std::vector init_fres = {1, 1, 1, 1, 1}; + std::vector init_values; init_values.resize(init_keys.size() * emb_dim); - table->pull_sparse(init_values.data(), init_keys.data(), init_keys.size()); + + auto value = PullSparseValue(init_keys, init_fres, emb_dim); + table->pull_sparse(init_values.data(), value); // push gradient std::vector> trainer_keys; diff --git a/paddle/fluid/extension/include/ext_tensor.h b/paddle/fluid/extension/include/ext_tensor.h index 52606b2a7f59e0..fa91490e6cd8af 100644 --- a/paddle/fluid/extension/include/ext_tensor.h +++ b/paddle/fluid/extension/include/ext_tensor.h @@ -113,6 +113,9 @@ class PD_DLL_DECL Tensor { /// \brief Cast datatype from one to another Tensor cast(const DataType& target_type) const; + /// \brief Check Tensor is initialized + bool is_initialized() const; + #ifdef PADDLE_WITH_CUDA /// \bref Get current stream of Tensor cudaStream_t stream() const; diff --git a/paddle/fluid/extension/src/ext_tensor.cc b/paddle/fluid/extension/src/ext_tensor.cc index e9705e2101cc3c..8b2f7cc5bf13c9 100644 --- a/paddle/fluid/extension/src/ext_tensor.cc +++ 
b/paddle/fluid/extension/src/ext_tensor.cc @@ -103,15 +103,6 @@ void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc, void Tensor::reshape(const std::vector &shape) { GET_CASTED_TENSOR auto new_dim = framework::make_ddim(shape); - if (tensor->numel() != framework::product(new_dim)) { - LOG(WARNING) << "Custom Op: Calling reshape to a new shape which is bigger " - "or smaller" - << "than original shape will not change your tensor's memory " - "Please call" - << "paddle::Tensor::mutable_data() after to reallocate " - "your tensor's size." - << std::endl; - } tensor->Resize(new_dim); } @@ -393,6 +384,15 @@ int64_t Tensor::size() const { return tensor->numel(); } +bool Tensor::is_initialized() const { + GET_CASTED_TENSOR; + if (tensor->IsInitialized()) { + return true; + } else { + return false; + } +} + #ifdef PADDLE_WITH_CUDA cudaStream_t Tensor::stream() const { if (!stream_.IsStreamSet()) { diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 1fa4ce9b573a09..24bed277280839 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -191,13 +191,15 @@ if(WITH_PYTHON) py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto) #Generate an empty \ #__init__.py to make framework_py_proto as a valid python module. + add_custom_target(fleet_proto_init ALL + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py + ) add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) - add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto) + add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init) if (NOT WIN32) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto - COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto COMMENT "Copy generated python proto into directory paddle/fluid/proto." @@ -207,8 +209,6 @@ if(WITH_PYTHON) string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/") add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto - COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py COMMAND copy /Y *.py ${proto_dstpath} COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath} COMMENT "Copy generated python proto into directory paddle/fluid/proto." 
@@ -217,6 +217,12 @@ if(WITH_PYTHON) endif(NOT WIN32) endif() +if (WITH_PSCORE) + add_custom_target(index_dataset_proto_init ALL DEPENDS fleet_proto_init index_dataset_py_proto + COMMAND cp ${PADDLE_BINARY_DIR}/paddle/fluid/distributed/index_dataset/index_dataset_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto.") +endif(WITH_PSCORE) + cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) @@ -360,71 +366,30 @@ set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_prot cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) -# Old custom op extension mechanism related, will be removed in 2.1.0 -cc_library(paddle_framework_shared - SHARED SRCS executor.cc operator.cc - ${CMAKE_CURRENT_SOURCE_DIR}/c/c_api.cc - ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc - DEPS ${FLUID_FRAMEWORK_MODULES}) -get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) -set_target_properties(paddle_framework_shared PROPERTIES OUTPUT_NAME paddle_framework) -target_link_libraries(paddle_framework_shared ${os_dependency_modules}) - -if (LINUX) - set(FLUID_FRAMEWORK_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.so - CACHE INTERNAL "Fluid framework lib") -endif() - -if (WIN32) - if("${CMAKE_GENERATOR}" STREQUAL "Ninja") - set(paddle_framework_lib_path ${CMAKE_CURRENT_BINARY_DIR}) - else() - set(paddle_framework_lib_path ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}) - endif() - set(FLUID_FRAMEWORK_IMPORT_LIB - ${paddle_framework_lib_path}/paddle_framework.lib - CACHE INTERNAL "Fluid framework lib") - set(FLUID_FRAMEWORK_SHARED_LIB - ${paddle_framework_lib_path}/paddle_framework.dll - CACHE INTERNAL "Fluid framework dll") -endif() - -if(APPLE) - set(FLUID_FRAMEWORK_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.dylib - CACHE INTERNAL "Fluid framework lib") -endif() if(WITH_TESTING AND TEST selected_rows_test) set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120) endif() -# New custom op extension mechanism related +##### 2.0 New custom op extension mechanism related ##### # if not deps `layer`, will cause: undefined symbol: _ZN6paddle10imperative7VarBase9name_set_ -set(PADDLE_CUSTOM_OP_MODULES custom_tensor op_meta_info custom_operator layer) - -set(PADDLE_CUSTOM_OP_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/custom_operator.cc - ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_tensor.cc - ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_op_meta_info.cc - ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc) -set(PADDLE_CUSTOM_OP_SRCS ${PADDLE_CUSTOM_OP_SRCS} PARENT_SCOPE) +if (WIN32) + set(PADDLE_CUSTOM_OP_MODULES custom_tensor op_meta_info custom_operator layer) -cc_library(paddle_custom_op_shared - SHARED SRCS ${PADDLE_CUSTOM_OP_SRCS} DEPS ${PADDLE_CUSTOM_OP_MODULES}) + set(PADDLE_CUSTOM_OP_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/custom_operator.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_tensor.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_op_meta_info.cc + ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc) + set(PADDLE_CUSTOM_OP_SRCS ${PADDLE_CUSTOM_OP_SRCS} PARENT_SCOPE) -get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) -set_target_properties(paddle_custom_op_shared PROPERTIES OUTPUT_NAME paddle_custom_op) -target_link_libraries(paddle_custom_op_shared 
${os_dependency_modules}) + cc_library(paddle_custom_op_shared + SHARED SRCS ${PADDLE_CUSTOM_OP_SRCS} DEPS ${PADDLE_CUSTOM_OP_MODULES}) -if (LINUX) - set(PADDLE_CUSTOM_OP_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_custom_op.so - CACHE INTERNAL "Paddle custom op lib") -endif() + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + set_target_properties(paddle_custom_op_shared PROPERTIES OUTPUT_NAME paddle_custom_op) + target_link_libraries(paddle_custom_op_shared ${os_dependency_modules}) -if (WIN32) if("${CMAKE_GENERATOR}" STREQUAL "Ninja") set(paddle_custom_op_lib_path ${CMAKE_CURRENT_BINARY_DIR}) else() @@ -437,9 +402,3 @@ if (WIN32) ${paddle_custom_op_lib_path}/paddle_custom_op.dll CACHE INTERNAL "Paddle custom op dll") endif() - -if(APPLE) - set(PADDLE_CUSTOM_OP_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/paddle_custom_op.dylib - CACHE INTERNAL "Paddle custom op lib") -endif() diff --git a/paddle/fluid/framework/c/c_api.cc b/paddle/fluid/framework/c/c_api.cc deleted file mode 100644 index 5e73c5cc23afa4..00000000000000 --- a/paddle/fluid/framework/c/c_api.cc +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/c/c_api.h" - -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" - -extern "C" { - -paddle::framework::OpInfoMap &PD_GetOpInfoMap() { - return paddle::framework::OpInfoMap::Instance(); -} - -void PD_InitDevicesPool(paddle::platform::DeviceContextPool *pool) { - paddle::platform::DeviceContextPool::SetPool(pool); -} - -std::vector PD_GetGradOpDescStrs( - const paddle::framework::OpDesc &op_desc, - const std::unordered_set &no_grad_set, - std::unordered_map *grad_to_var, - const std::vector &grad_block) { - auto &op_info = PD_GetOpInfoMap().Get(op_desc.Type()); - std::vector ret; - if (op_info.grad_op_maker_) { - auto grad_op_descs = - op_info.grad_op_maker_(op_desc, no_grad_set, grad_to_var, grad_block); - size_t op_num = grad_op_descs.size(); - ret.resize(op_num); - for (size_t i = 0; i < op_num; ++i) { - PADDLE_ENFORCE_EQ( - grad_op_descs[i]->Proto()->SerializePartialToString(&ret[i]), true, - paddle::platform::errors::Unavailable( - "Cannot serialize operator desc message.")); - } - } - return ret; -} - -} // end extern "C" diff --git a/paddle/fluid/framework/c/c_api.h b/paddle/fluid/framework/c/c_api.h deleted file mode 100644 index a9ec402f381e43..00000000000000 --- a/paddle/fluid/framework/c/c_api.h +++ /dev/null @@ -1,55 +0,0 @@ -/* copyright (c) 2019 paddlepaddle authors. all rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace framework { -class OpInfoMap; -} // namespace framework -namespace platform { -class DeviceContextPool; -} // namespace platform -} // namespace paddle - -#ifdef __cplusplus -extern "C" { -#endif - -// C-API to get global OpInfo map. -paddle::framework::OpInfoMap &PD_GetOpInfoMap(); - -// C-API to init global DeviceContextPool from outside. -void PD_InitDevicesPool(paddle::platform::DeviceContextPool *pool); - -// C-API to serialize the grad op protocol message to a binary string. -std::vector PD_GetGradOpDescStrs( - const paddle::framework::OpDesc &op_desc, - const std::unordered_set &no_grad_set, - std::unordered_map *grad_to_var, - const std::vector &grad_block); - -#ifdef __cplusplus -} -#endif diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc index 8d6fd4efd5ae3d..a65dcbd55f9463 100644 --- a/paddle/fluid/framework/custom_tensor_test.cc +++ b/paddle/fluid/framework/custom_tensor_test.cc @@ -220,6 +220,21 @@ void GroupTestDtypeConvert() { paddle::DataType::FLOAT16); } +void TestInitilized() { + paddle::Tensor test_tensor(paddle::PlaceType::kCPU); + CHECK(test_tensor.is_initialized() == false); + test_tensor.reshape({1, 1}); + test_tensor.mutable_data(); + CHECK(test_tensor.is_initialized() == true); + float* tensor_data = test_tensor.data(); + for (int i = 0; i < test_tensor.size(); i++) { + tensor_data[i] = 0.5; + } + for (int i = 0; i < test_tensor.size(); i++) { + CHECK(tensor_data[i] == 0.5); + } +} + TEST(CustomTensor, copyTest) { VLOG(2) << "TestCopy"; GroupTestCopy(); @@ -233,4 +248,6 @@ TEST(CustomTensor, copyTest) { GroupTestCast(); VLOG(2) << "TestDtypeConvert"; GroupTestDtypeConvert(); + VLOG(2) << "TestInitilized"; + TestInitilized(); } diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 34c87b8388975a..5636e3ed1b63f9 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -161,9 +161,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); -#else - LOG(WARNING) << "fusion_group is not enabled for Windows/MacOS now, and " - "only effective when running with CUDA GPU."; #endif AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_, "fuse_elewise_add_act_pass"); @@ -265,12 +262,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { if (FLAGS_use_mkldnn) { AppendPass(pass_name); } else if (!strategy_.mkldnn_enabled_op_types_.empty()) { - LOG(WARNING) - << "mkldnn_enabled_op_types specify the operator type list to " - "use MKLDNN acceleration. 
It is null in default, means " - "that all the operators supported by MKLDNN will be " - "accelerated. And it should not be set when " - "FLAGS_use_mkldnn=false."; + VLOG(1) << "mkldnn_enabled_op_types specify the operator type list to " + "use MKLDNN acceleration. It is null in default, means " + "that all the operators supported by MKLDNN will be " + "accelerated. And it should not be set when " + "FLAGS_use_mkldnn=false."; } #else PADDLE_ENFORCE_NE(FLAGS_use_mkldnn, true, @@ -403,26 +399,26 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, << ", num_trainers:" << num_trainers_; } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") { if (use_device != p::kCUDA) { - LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on " - "GPU, skipped."; + VLOG(1) << "fuse_relu_depthwise_conv_pass is only supported on " + "GPU, skipped."; continue; } } else if (pass->Type() == "fusion_group_pass") { pass->Set("use_gpu", new bool((use_device == p::kCUDA))); if (use_device != p::kCUDA) { - LOG(WARNING) << "fusion_group_pass is only supported on GPU, skipped."; + VLOG(1) << "fusion_group_pass is only supported on GPU, skipped."; continue; } } else if (pass->Type() == "fuse_bn_act_pass") { if (use_device != p::kCUDA) { - LOG(WARNING) << "fuse_bn_act_pass is only supported on " - "GPU, skipped."; + VLOG(1) << "fuse_bn_act_pass is only supported on " + "GPU, skipped."; continue; } } else if (pass->Type() == "fuse_bn_add_act_pass") { if (use_device != p::kCUDA) { - LOG(WARNING) << "fuse_bn_add_act_pass is only supported on " - "GPU, skipped."; + VLOG(1) << "fuse_bn_add_act_pass is only supported on " + "GPU, skipped."; continue; } } else if (pass->Type() == "mkldnn_placement_pass") { diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 05c54a90f7eb02..628b9f0d70f598 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -205,7 +205,7 @@ class DeviceWorker { Scope* root_scope_ = nullptr; Scope* thread_scope_; paddle::platform::Place place_; - int64_t batch_num_; + int64_t batch_num_ = 0; FetchConfig fetch_config_; bool use_cvm_; bool no_cvm_; @@ -562,7 +562,6 @@ class PSGPUWorker : public HogwildWorker { void ResetStat(); protected: - std::shared_ptr fleet_ptr_; void PushGradients(); void DumpParam(); void CopySparseTable(); diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto old mode 100644 new mode 100755 index b36793507f54bf..e6a7d74cc43433 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -29,9 +29,24 @@ message RecomputeConfig { } message ShardingConfig { - optional float fuse_broadcast_MB = 1 [ default = 32.0 ]; - optional bool hybrid_dp = 2 [ default = false ]; - optional int32 sharding_group_size = 3 [ default = 8 ]; + optional string sharding_segment_strategy = 1 + [ default = 'segment_broadcast_MB' ]; + optional float segment_broadcast_MB = 2 [ default = 32.0 ]; + repeated string segment_anchors = 3; + optional int32 sharding_degree = 4 [ default = 8 ]; + optional int32 mp_degree = 5 [ default = 1 ]; + optional int32 dp_degree = 6 [ default = 1 ]; + optional bool hybrid_dp = 7 [ default = false ]; + optional int32 gradient_merge_acc_step = 8 [ default = 1 ]; + optional bool optimize_offload = 9 [ default = false ]; + optional bool pp_allreduce_in_optimize = 10 [ default = false ]; + optional int32 pp_degree = 11 [ default = 1 ]; +} + +message HybridConfig { + 
optional int32 dp_degree = 1 [ default = -1 ]; + optional int32 mp_degree = 2 [ default = 1 ]; + optional int32 pp_degree = 3 [ default = 1 ]; } message AMPConfig { @@ -115,6 +130,7 @@ message AsyncConfig { optional bool launch_barrier = 9 [ default = true ]; optional string heter_worker_device_guard = 10 [ default = 'cpu' ]; optional int32 lr_decay_steps = 11 [ default = 10 ]; + optional int32 use_ps_gpu = 12 [ default = 0 ]; } message PipelineConfig { @@ -152,6 +168,7 @@ message DistributedStrategy { optional bool fp16_allreduce = 25 [ default = false ]; optional bool sharding = 26 [ default = false ]; optional float last_comm_group_size_MB = 27 [ default = 1 ]; + optional bool find_unused_parameters = 28 [ default = true ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; @@ -164,6 +181,7 @@ message DistributedStrategy { optional LambConfig lamb_configs = 109; optional AdaptiveLocalSGDConfig adaptive_localsgd_configs = 110; optional ShardingConfig sharding_configs = 111; + optional HybridConfig hybrid_configs = 112; optional BuildStrategy build_strategy = 201; optional ExecutionStrategy execution_strategy = 202; } diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index a3fbb008fe4f44..b99ab6b5a7ff19 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -82,6 +82,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { platform::errors::Unimplemented("platform::XPUPlace is not supported")); } + inline ::DLContext operator()(const platform::NPUPlace &place) const { + PADDLE_THROW( + platform::errors::Unimplemented("platform::NPUPlace is not supported")); + } + inline ::DLContext operator()(const platform::CUDAPlace &place) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ::DLContext ctx; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 0acc8a55fa9f8a..101991d2c1ba00 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -453,6 +453,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, #else PADDLE_THROW( platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle")); +#endif + } else if (platform::is_npu_place(place_)) { +#ifdef PADDLE_WITH_ASCEND_CL + // TODO(ascendrc): Support garbage collector on NPUPlace + VLOG(4) << "Skip NPU gc because it is not implemented now."; +#else + PADDLE_THROW(platform::errors::Unimplemented( + "No NPU gc found in CPU/GPU/XPU paddle")); #endif } } diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 61f3c026f1facc..c8517b9503741b 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -1,5 +1,10 @@ if(WITH_PSLIB) cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope pslib_brpc pslib) +else() + cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) +endif(WITH_PSLIB) + +if(WITH_HETERPS) if(WITH_NCCL) nv_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc DEPS heter_ps) @@ -8,13 +13,10 @@ if(WITH_PSLIB) hip_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc DEPS heter_ps) add_subdirectory(heter_ps) - else() - cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc) endif(WITH_NCCL) else() - cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) 
cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc)
-endif(WITH_PSLIB)
+endif(WITH_HETERPS)
if(WITH_NCCL OR WITH_RCCL)
cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope)
@@ -42,5 +44,5 @@ cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_conte
cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
if(WITH_ASCEND)
- cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend ascend_graph)
+ cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend_ge ascend_graph)
endif(WITH_ASCEND)
diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h
index da79fccb8ca69f..baa2fd126a4b77 100644
--- a/paddle/fluid/framework/fleet/ascend_wrapper.h
+++ b/paddle/fluid/framework/fleet/ascend_wrapper.h
@@ -37,25 +37,50 @@ limitations under the License. */
namespace paddle {
namespace framework {
-// typedef std::vector AscendGraphDesc;
typedef ge::Graph AscendGraphDesc;
+#ifdef PADDLE_WITH_ASCEND_STRING
+using AscendString = ge::AscendString;
+#else
+using AscendString = std::string;
+#endif
+
class AscendInstance {
public:
virtual ~AscendInstance() {}
AscendInstance() {}
- std::map GetDefaultInitSessionOptions() {
- std::map init_options;
- init_options["a"] = "b";
- init_options["ge.trainFlag"] = "1";
+ std::map _GetDefaultInitOptions() {
+ std::map init_options;
+ init_options["ge.exec.deviceId"] = "0";
+ init_options["ge.graphRunMode"] = "1";
+ return init_options;
+ }
+
+ std::map _GetDefaultInitSessionOptions() {
+ std::map init_options;
+ // init_options["a"] = "b";
+ // init_options["ge.trainFlag"] = "1";
return init_options;
}
- // add other parameters here to init
+ ge::Status InitGEForUT() {
+ return ge::GEInitialize(_GetDefaultInitOptions());
+ }
+
void InitGlobalResouces() {
- session_.reset(new ge::Session(GetDefaultInitSessionOptions()));
- VLOG(1) << "InitGlobalResouces Done";
+ LOG(INFO) << "Begin ascend InitGlobalResouces";
+ session_.reset(new ge::Session(_GetDefaultInitSessionOptions()));
+ if (session_ == nullptr) {
+ PADDLE_THROW(platform::errors::Fatal("new session error: nullptr"));
+ }
+ LOG(INFO) << "End ascend InitGlobalResouces";
+ }
+
+ void DestroyGlobalResouces() {
+ LOG(INFO) << "Begin ascend DestroyGlobalResouces";
+ session_ = nullptr;
+ LOG(INFO) << "End ascend DestroyGlobalResouces";
+ }
+
static std::shared_ptr GetInstance() {
@@ -178,6 +203,6 @@ class AscendInstance {
private:
static std::shared_ptr ascend_instance_;
};
-} // end namespace framework
-} // end namespace paddle
+} // namespace framework
+} // namespace paddle
#endif
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h
index e584fb5e2b9ca7..613b2803637d2d 100644
--- a/paddle/fluid/framework/fleet/fleet_wrapper.h
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.h
@@ -34,6 +34,9 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
+#ifdef PADDLE_WITH_HETERPS
+#include "paddle/fluid/platform/type_defs.h"
+#endif
namespace paddle {
namespace framework {
diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h
index a02931b3f5c28a..6f063e830c2da7 100644
--- a/paddle/fluid/framework/fleet/heter_context.h
+++ b/paddle/fluid/framework/fleet/heter_context.h
@@ -14,15 +14,21 @@ limitations under the License. */
#pragma once
-#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
- (defined PADDLE_WITH_PSLIB)
+#ifdef PADDLE_WITH_HETERPS
#include
#include
#include
#include
+#ifdef PADDLE_WITH_PSLIB
#include "common_value.h" // NOLINT
+#endif
+
+#ifdef PADDLE_WITH_PSCORE
+#include "paddle/fluid/distributed/table/depends/large_scale_kv.h"
+#endif
+
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/scope.h"
@@ -39,7 +45,12 @@ class HeterContext {
}
Scope* scope_{nullptr};
std::vector> feature_keys_;
+#ifdef PADDLE_WITH_PSLIB
std::vector> value_ptr_;
+#endif
+#ifdef PADDLE_WITH_PSCORE
+ std::vector> value_ptr_;
+#endif
std::vector> device_values_;
std::vector> device_keys_;
std::vector mutex_;
diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h
index 698ece09de6c50..c3bf33b32c2daf 100644
--- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h
+++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h
@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once
-#ifdef PADDLE_WITH_PSLIB
+#ifdef PADDLE_WITH_HETERPS
#include
diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h
index e5c0972763bede..089130f6da8c73 100644
--- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h
+++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h
@@ -17,11 +17,17 @@ limitations under the License. */
#include
#include
#include
+#ifdef PADDLE_WITH_PSLIB
#include "common_value.h" // NOLINT
+#endif
+#ifdef PADDLE_WITH_PSCORE
+#endif
#include "thrust/pair.h"
//#include "cudf/concurrent_unordered_map.cuh.h"
#include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h"
-#ifdef PADDLE_WITH_PSLIB
+#ifdef PADDLE_WITH_HETERPS
+#include "paddle/fluid/distributed/table/depends/large_scale_kv.h"
+#include "paddle/fluid/platform/type_defs.h"
namespace paddle {
namespace framework {
diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
index 871f9c7857af46..098c795fc7e1f9 100644
--- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
+++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { @@ -119,6 +119,7 @@ void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { continue; } ValType& gpu_val = kv[i].second; +#ifdef PADDLE_WITH_PSLIB auto* downpour_value = (paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr); int downpour_value_size = downpour_value->size(); @@ -138,6 +139,14 @@ void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { cpu_val[x + 7] = gpu_val.mf[x]; } } +#endif +#ifdef PADDLE_WITH_PSCORE + auto* downpour_value = (paddle::distributed::VALUE*)(gpu_val.cpu_ptr); + downpour_value->count_ = gpu_val.show; + for (int x = 0; x < gpu_val.mf_size; x++) { + downpour_value->data_[x] = gpu_val.mf[x]; + } +#endif } container_->prefetch(devid, stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 0e38ebbd7f4e72..2ec2a8a1f1e223 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -25,7 +25,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "thrust/pair.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { @@ -182,7 +182,7 @@ class HeterComm { std::vector> path_; std::vector storage_; int feanum_{1800 * 2048}; - int multi_node_{1}; + int multi_node_{0}; std::vector nccl_inner_comms_; std::vector nccl_inter_comms_; int node_size_; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 2f1c809c01eaad..1b4205e3c38fe2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#ifdef PADDLE_WITH_HETERPS #include -#ifdef PADDLE_WITH_PSLIB namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index f2e129ded9fefc..581b0d511c23ee 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { @@ -54,8 +54,8 @@ void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } void HeterPs::push_sparse(int num, FeatureKey* d_keys, FeaturePushValue* d_grads, size_t len) { - // comm_->push_sparse(num, d_keys, d_grads, len, opt_); - comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); + comm_->push_sparse(num, d_keys, d_grads, len, opt_); + // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); } void HeterPs::set_nccl_comm_and_size(const std::vector& inner_comms, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index 142f4a93b93a29..d78b6b492074de 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h" #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index 7980220eab9b9b..05b3ecf9c3c12c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc index f65b664f83ba0d..0f2af2a522e287 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS #include "heter_resource.h" #include "paddle/fluid/platform/cuda_device_guard.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h index ad7649a8a33cb7..7b23379994c735 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/enforce.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index b3ec9e752e62bb..7e82a8e014fd3c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "optimizer_conf.h" #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 4274876c9975e5..b7bb5110744649 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -26,8 +26,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ - (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_HETERPS #include #include @@ -58,7 +57,12 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, auto& device_mutex = gpu_task->mutex_; std::vector threads; +#ifdef PADDLE_WITH_PSLIB auto fleet_ptr = FleetWrapper::GetInstance(); +#endif +#ifdef PADDLE_WITH_PSCORE + auto fleet_ptr = paddle::distributed::Communicator::GetInstance(); +#endif // data should be in input channel thread_keys_.resize(thread_keys_thread_num_); @@ -124,9 +128,16 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, auto ptl_func = [this, &local_keys, &local_ptr, &table_id, &fleet_ptr](int i) { size_t key_size = local_keys[i].size(); +#ifdef PADDLE_WITH_PSLIB auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( reinterpret_cast(local_ptr[i].data()), table_id, local_keys[i].data(), key_size); +#endif +#ifdef PADDLE_WITH_PSCORE + auto tt = fleet_ptr->_worker_ptr->pull_sparse_ptr( + reinterpret_cast(local_ptr[i].data()), table_id, + local_keys[i].data(), key_size); +#endif tt.wait(); auto status = tt.get(); // auto status = 0; @@ -153,8 +164,14 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, auto build_func = [device_num, &local_keys, &local_ptr, &device_keys, &device_vals, &device_mutex](int i) { std::vector> task_keys(device_num); +#ifdef PADDLE_WITH_PSLIB std::vector> task_ptrs( device_num); +#endif + +#ifdef PADDLE_WITH_PSCORE + std::vector> task_ptrs(device_num); +#endif for (size_t j = 0; j < local_keys[i].size(); j++) { int shard = local_keys[i][j] % device_num; @@ -169,7 +186,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, int cur = device_keys[dev].size(); device_keys[dev].resize(device_keys[dev].size() + len); device_vals[dev].resize(device_vals[dev].size() + len); - +#ifdef PADDLE_WITH_PSLIB for (int j = 0; j < len; ++j) { device_keys[dev][cur + j] = task_keys[dev][j]; float* ptr_val = task_ptrs[dev][j]->data(); @@ -196,6 +213,35 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, } } } +#endif +#ifdef PADDLE_WITH_PSCORE + for (int j = 0; j < len; ++j) { + device_keys[dev][cur + j] = task_keys[dev][j]; + distributed::VALUE* ptr_val = task_ptrs[dev][j]; + FeatureValue& val = device_vals[dev][cur + j]; + bool has_mf = 1; + val.delta_score = 0; + val.show = ptr_val->count_; + val.clk = 0; + val.slot = 0; + val.lr = 0; + val.lr_g2sum = 0; + val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); + + if (has_mf) { + val.mf_size = MF_DIM + 1; + for (int x = 0; x < val.mf_size; x++) { + val.mf[x] = ptr_val->data_[x]; + } + } else { + val.mf_size = 0; + for (int x = 0; x < MF_DIM + 1; x++) { + val.mf[x] = 0; + } + } + } +#endif + VLOG(1) << "GpuPs build hbmps done"; device_mutex[dev]->unlock(); } diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 2eedcd5f1c7005..2bf564d3f76d5a 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS #include #include #include diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index ef586b41fe05d2..cfb23d1be2acfe 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -14,8 +14,7 @@ limitations under the License. */ #pragma once -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ - (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_HETERPS #include #include @@ -26,7 +25,6 @@ limitations under the License. */ #include #include #include - #ifdef PADDLE_WITH_GLOO #include #include "paddle/fluid/framework/fleet/gloo_wrapper.h" @@ -42,6 +40,9 @@ limitations under the License. */ #include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/service/communicator.h" +#endif namespace paddle { namespace framework { @@ -219,7 +220,7 @@ class PSGPUWrapper { std::shared_ptr resource_; int32_t sleep_seconds_before_fail_exit_; std::vector slot_vector_; - int multi_node_{1}; + int multi_node_{0}; int node_size_; std::vector inner_comms_; std::vector inter_comms_; diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index c8b6c764255175..8dfbd3c268b866 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -86,8 +86,9 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_)); + callback_manager_.reset( + new platform::StreamCallbackManager(stream_)); #endif - callback_manager_.reset(new platform::StreamCallbackManager(stream_)); } StreamGarbageCollector::~StreamGarbageCollector() { diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 97800865af861f..572c79d21a045b 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -117,7 +117,8 @@ class StreamGarbageCollector : public GarbageCollector { private: gpuStream_t stream_; - std::unique_ptr callback_manager_; + std::unique_ptr> + callback_manager_; }; class CUDAPinnedGarbageCollector : public GarbageCollector { diff --git a/paddle/fluid/framework/heter_service.h b/paddle/fluid/framework/heter_service.h index 8f52235c962445..3f65eaf3aa1216 100644 --- a/paddle/fluid/framework/heter_service.h +++ b/paddle/fluid/framework/heter_service.h @@ -30,10 +30,12 @@ limitations under the License. 
*/ #include "brpc/controller.h" #include "brpc/server.h" #include "paddle/fluid/platform/timer.h" +#endif namespace paddle { namespace framework { +#ifdef PADDLE_WITH_PSLIB typedef std::function HeterServiceHandler; class DataFeed; @@ -142,7 +144,7 @@ class HeterTask { double cpu_2_gpu_time{0}; platform::Timer timeline; }; - +#endif template class HeterObjectPool { public: @@ -153,7 +155,7 @@ class HeterObjectPool { if (pool_.empty()) { num_ += 1; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - VLOG(0) << "pool construct size: " << num_; + VLOG(3) << "pool construct size: " << num_; #endif return std::make_shared(); } else { @@ -178,6 +180,7 @@ class HeterObjectPool { int num_{0}; }; +#ifdef PADDLE_WITH_PSLIB struct BthreadMutextGuard { BthreadMutextGuard(bthread_mutex_t* rho) { mutex_ = rho; @@ -258,7 +261,6 @@ class HeterList { std::unique_lock lock(mutex_); cond_.wait(lock, [this] { return size < cap_; }); if (task_map_.find(key) != task_map_.end()) { - // std::cout << "try put key=" << key << " false" << std::endl; task_map_.erase(key); return false; } else { @@ -267,7 +269,6 @@ class HeterList { node->value = value; map_[node->key] = node; attach(node); - // std::cout << "try put key=" << key << " true" << std::endl; return true; } } @@ -276,7 +277,6 @@ class HeterList { std::unique_lock lock(mutex_); cond_.wait(lock, [this] { return size < cap_; }); HeterNode* node = new HeterNode; - // std::cout << "put key=" << key << " true" << std::endl; node->key = key; node->value = value; map_[node->key] = node; @@ -288,7 +288,6 @@ class HeterList { std::lock_guard lock(mutex_); auto iter = map_.find(key); if (iter != map_.end()) { - // std::cout << "try get key=" << key << " true" << std::endl; HeterNode* node = iter->second; detach(node); cond_.notify_one(); @@ -298,7 +297,6 @@ class HeterList { return ret; } task_map_.insert(key); - // std::cout << "try get key=" << key << " false" << std::endl; return nullptr; } @@ -306,7 +304,6 @@ class HeterList { std::lock_guard lock(mutex_); auto iter = map_.find(key); if (iter != map_.end()) { - // std::cout << "get key=" << key << " true" << std::endl; HeterNode* node = iter->second; detach(node); cond_.notify_one(); @@ -315,7 +312,6 @@ class HeterList { delete node; return ret; } - // std::cout << "get key=" << key << " false" << std::endl; return nullptr; } @@ -323,14 +319,12 @@ class HeterList { std::lock_guard lock(mutex_); HeterNode* node = head_->next; if (node == tail_) { - // std::cout << "get2 false" << std::endl; return nullptr; } else { detach(node); cond_.notify_one(); T ret = std::move(node->value); map_.erase(node->key); - // std::cout << "get2 key=" << node->key << " true" << std::endl; delete node; return ret; } @@ -371,7 +365,7 @@ class HeterList { int cap_; int size; }; +#endif } // namespace framework } // namespace paddle -#endif diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index d8639643f2c8a7..89dc5c7d3ea932 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" @@ -226,14 +227,32 @@ void HogwildWorker::PrintFetchVars() { // call count batch_num_++; int batch_per_print = fetch_config_.print_period(); - if (thread_id_ == 0) { - if (batch_num_ % batch_per_print == 0) { - int fetch_var_num = fetch_config_.fetch_var_names_size(); - for (int i = 0; i < fetch_var_num; ++i) { - platform::PrintVar(thread_scope_, fetch_config_.fetch_var_names(i), - fetch_config_.fetch_var_str_format(i)); + int fetch_var_num = fetch_config_.fetch_var_names_size(); + + if (fetch_var_num == 0) { + return; + } + + if (thread_id_ == 0 && batch_num_ % batch_per_print == 0) { + time_t curtime; + time(&curtime); + char mbstr[80]; + std::strftime(mbstr, sizeof(mbstr), "%Y-%m-%d %H:%M:%S", + std::localtime(&curtime)); + + std::stringstream ss; + ss << "time: [" << mbstr << "], "; + ss << "batch: [" << batch_num_ << "], "; + + for (int i = 0; i < fetch_var_num; ++i) { + platform::PrintVar(thread_scope_, fetch_config_.fetch_var_names(i), + fetch_config_.fetch_var_str_format(i), &ss); + if (i < fetch_var_num - 1) { + ss << ", "; } } + + std::cout << ss.str() << std::endl; } } diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc index 84c6b03e76bc1e..59d071e1034590 100644 --- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc @@ -34,15 +34,19 @@ namespace patterns { static PDNode* create_emb_vars(PDPattern* pattern, const std::string& name, const std::string& arg, bool is_persist = false) { + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; PDNode* node = - pattern->NewNode(name)->assert_is_op_input("lookup_table", arg); + pattern->NewNode(name)->assert_is_ops_input(embedding_ops, arg); if (is_persist) return node->assert_is_persistable_var(); return node; } static PDNode* create_emb_out_vars(PDPattern* pattern, const std::string& name, const std::string& arg) { + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; PDNode* node = pattern->NewNode(name) - ->assert_is_only_output_of_op("lookup_table") + ->assert_is_only_output_of_ops(embedding_ops) ->assert_is_op_input("elementwise_add", arg) ->AsIntermediate(); return node; @@ -56,10 +60,12 @@ void Embedding2Eltwise1Pattern::operator()() { create_emb_vars(pattern, lookup_table1_w_repr(), "W", true); auto* lookup_table2_w = create_emb_vars(pattern, lookup_table2_w_repr(), "W", true); + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; auto* lookup_table1 = - pattern->NewNode(lookup_table1_repr())->assert_is_op("lookup_table"); + pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops); auto* lookup_table2 = - pattern->NewNode(lookup_table2_repr())->assert_is_op("lookup_table"); + pattern->NewNode(lookup_table2_repr())->assert_is_ops(embedding_ops); auto* lookup_table1_out = create_emb_out_vars(pattern, lookup_table1_out_repr(), "X"); auto* lookup_table2_out = @@ -80,8 +86,10 @@ void Embedding1Eltwise1Pattern::operator()() { create_emb_vars(pattern, lookup_table1_x_repr(), "Ids"); auto* lookup_table1_w = create_emb_vars(pattern, lookup_table1_w_repr(), "W", true); + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; auto* lookup_table1 = - 
pattern->NewNode(lookup_table1_repr())->assert_is_op("lookup_table"); + pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops); auto* lookup_table1_out = create_emb_out_vars(pattern, lookup_table1_out_repr(), "Y"); auto* eltwise_add = @@ -291,6 +299,11 @@ static int BuildFusion(Graph* graph, const std::string& name_scope new_op_desc.SetOutput("Out", {end_pattern_out[k]->Name()}); new_op_desc.SetAttr("epsilon", end_patter_layernorms[k]->Op()->GetAttr("epsilon")); + + if (end_patter_layernorms[k]->Op()->HasAttr("out_threshold")) { + new_op_desc.SetAttr("enable_int8", true); + } + auto* embedding_eltwise_layernorm = graph->CreateOpNode(&new_op_desc); for (size_t iter = 0; iter < start_pattern_in_nodes[i].size(); ++iter) { @@ -347,4 +360,5 @@ REGISTER_PASS_CAPABILITY(embedding_eltwise_layernorm_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .EQ("lookup_table", 0) + .LE("lookup_table_v2", 1) .EQ("elementweise_add", 0)); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index deb182c0fbe19c..d74e8e5f65cd20 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -652,6 +652,36 @@ PDNode *PDNode::assert_is_ops_input( return this; } +PDNode *PDNode::assert_is_only_input_of_ops( + const std::unordered_set &op_types) { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->outputs) { + if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type()) && + op->inputs.size() == 1) { + return true; + } + } + return false; + }); + return this; +} + +PDNode *PDNode::assert_is_only_output_of_ops( + const std::unordered_set &op_types) { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->inputs) { + if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type()) && + op->outputs.size() == 1) { + return true; + } + } + return false; + }); + return this; +} + bool VarLinksToOp(Node *node, const std::string &op_type) { for (auto *out : node->outputs) { if (out->IsOp() && out->Op()->Type() == op_type) { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index b6c1074d90dd2a..cfac01ec9dedc8 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -145,6 +145,11 @@ struct PDNode { const std::unordered_set& op_types, const std::string& argument, int nth); + PDNode* assert_is_only_input_of_ops( + const std::unordered_set& op_types); + PDNode* assert_is_only_output_of_ops( + const std::unordered_set& op_types); + PDNode* assert_has_n_inputs(size_t n); PDNode* assert_has_n_outputs(size_t n); diff --git a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc index a2443c86986ec8..c36123f65f6644 100644 --- a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc +++ b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc @@ -57,7 +57,7 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { std::vector y_shape = matmul_in_y->Var()->GetShape(); size_t x_rank = x_shape.size(); size_t y_rank = y_shape.size(); - flag = flag && x_rank == 2 && y_rank == 2; + flag = flag && (x_rank == 2 || x_rank == 3) && y_rank == 2; std::vector& next_ops = matmul_out->outputs; flag = flag && next_ops.size() == 1 && @@ -69,7 +69,7 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { 
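Note (illustrative, not part of the patch): MapMatmul2MulPass above now also accepts a rank-3 X input, and the hunk that follows sets x_num_col_dims to x_rank - 1 accordingly. The sketch below shows why that is equivalent: mul flattens the leading input dimensions, so a [B, S, H] input with x_num_col_dims = 2 multiplies a [H, N] weight exactly like a [B*S, H] matrix. Sizes are toy values.

#include <cstdio>
#include <vector>

int main() {
  const int B = 2, S = 3, H = 4, N = 5;
  std::vector<float> x(B * S * H, 1.0f);    // rank-3 input, row-major storage
  std::vector<float> w(H * N, 0.5f);        // rank-2 weight
  std::vector<float> out(B * S * N, 0.0f);  // result shape [B, S, N]
  for (int r = 0; r < B * S; ++r)           // rows of the flattened input
    for (int c = 0; c < N; ++c)
      for (int k = 0; k < H; ++k)
        out[r * N + c] += x[r * H + k] * w[k * N + c];
  std::printf("out[0] = %.1f (expect H * 1.0 * 0.5 = 2.0)\n", out[0]);
  return 0;
}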
desc.SetInput("X", {matmul_in_x->Name()}); desc.SetInput("Y", {matmul_in_y->Name()}); desc.SetOutput("Out", {matmul_out->Name()}); - desc.SetAttr("x_num_col_dims", 1); + desc.SetAttr("x_num_col_dims", static_cast(x_rank - 1)); desc.SetAttr("y_num_col_dims", 1); if (matmul_op->Op()->HasAttr("enable_int8")) { desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index e20c0667ec3bc2..1e8349e878781d 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -535,6 +535,38 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope, multihead_op_desc.SetAttr("alpha", scale_attr); multihead_op_desc.SetAttr("head_number", head_number); + auto* mul0_op_desc = mul0->Op(); + auto* mul1_op_desc = mul1->Op(); + auto* mul2_op_desc = mul2->Op(); + if (mul0_op_desc->HasAttr("enable_int8")) { + multihead_op_desc.SetAttr("enable_int8", + mul0_op_desc->GetAttr("enable_int8")); + // all mul op has same input. + multihead_op_desc.SetAttr("Input_scale", + mul0_op_desc->GetAttr("X_scale")); + auto weight_scale0 = BOOST_GET_CONST( + std::vector, mul0_op_desc->GetAttr("weight_scale")); + auto weight_scale1 = BOOST_GET_CONST( + std::vector, mul1_op_desc->GetAttr("weight_scale")); + auto weight_scale2 = BOOST_GET_CONST( + std::vector, mul2_op_desc->GetAttr("weight_scale")); + auto weight_max = std::max(weight_scale0, weight_scale1); + weight_max = std::max(weight_max, weight_scale2); + multihead_op_desc.SetAttr("weight_scale", weight_max); + + if (mul0_op_desc->HasAttr("out_threshold")) { + auto out_scale0 = + BOOST_GET_CONST(float, mul0_op_desc->GetAttr("out_threshold")); + auto out_scale1 = + BOOST_GET_CONST(float, mul1_op_desc->GetAttr("out_threshold")); + auto out_scale2 = + BOOST_GET_CONST(float, mul2_op_desc->GetAttr("out_threshold")); + auto out_scale_max = std::max(out_scale0, out_scale1); + out_scale_max = std::max(out_scale_max, out_scale2); + multihead_op_desc.SetAttr("out_threshold", out_scale_max); + } + } + auto* multihead = graph->CreateOpNode(&multihead_op_desc); IR_NODE_LINK_TO(input0, multihead); @@ -682,6 +714,447 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope, return fusion_count; } +PDNode* MultiHeadMatmulV3Pattern::operator()() { + std::unordered_set matmul_ops{"matmul", "matmul_v2"}; + auto* input0 = pattern->NewNode(input0_repr()); + input0->assert_is_op_input("matmul"); + + // First path with scale + auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_op("matmul"); + auto* mul0_w_var = pattern->NewNode(mul0_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul", "Y"); + auto* mul0_out_var = + pattern->NewNode(mul0_out_repr())->assert_is_op_output("matmul"); + + decltype(mul0) eltadd0; + decltype(mul0) eltadd0_b_var; + decltype(mul0) eltadd0_out_var; + + mul0_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + eltadd0 = pattern->NewNode(eltadd0_repr())->assert_is_op("elementwise_add"); + eltadd0_b_var = pattern->NewNode(eltadd0_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd0_out_var = pattern->NewNode(eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd0_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_0 = + pattern->NewNode(reshape2_0_repr())->assert_is_op("reshape2"); + + auto* reshape2_0_out_var = + 
pattern->NewNode(reshape2_0_out_repr())->assert_is_op_output("reshape2"); + reshape2_0_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_0 = + pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); + auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_0_out_var->AsIntermediate()->assert_is_op_input("matmul"); + + auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul"); + auto* matmul_qk_out_var = + pattern->NewNode(matmul_qk_out_repr())->assert_is_op_output("matmul"); + matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + auto* eltadd_qk = + pattern->NewNode(eltadd_qk_repr())->assert_is_op("elementwise_add"); + auto* eltadd_qk_b_var = pattern->NewNode(eltadd_qk_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd_qk_out_var = pattern->NewNode(eltadd_qk_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd_qk_out_var->AsIntermediate()->assert_is_op_input("softmax"); + + auto* softmax_qk = + pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax"); + auto* softmax_qk_out_var = + pattern->NewNode(softmax_qk_out_repr())->assert_is_op_output("softmax"); + softmax_qk_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops); + + auto* matmul_qkv = + pattern->NewNode(matmul_qkv_repr())->assert_is_ops(matmul_ops); + auto* matmul_qkv_out_var = + pattern->NewNode(matmul_qkv_out_repr())->assert_is_ops_output(matmul_ops); + matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_qkv = + pattern->NewNode(transpose2_qkv_repr())->assert_is_op("transpose2"); + auto* transpose2_qkv_out_var = pattern->NewNode(transpose2_qkv_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_qkv_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_qkv = + pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); + auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr()) + ->assert_is_op_output("reshape2"); + reshape2_qkv_out_var->assert_is_op_input("matmul"); + + // Second path to matmul + auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_op("matmul"); + auto* mul1_w_var = pattern->NewNode(mul1_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul", "Y"); + auto* mul1_out_var = + pattern->NewNode(mul1_out_repr())->assert_is_op_output("matmul"); + + decltype(mul1) eltadd1; + decltype(mul1) eltadd1_b_var; + decltype(mul1) eltadd1_out_var; + + mul1_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + eltadd1 = pattern->NewNode(eltadd1_repr())->assert_is_op("elementwise_add"); + eltadd1_b_var = pattern->NewNode(eltadd1_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd1_out_var = pattern->NewNode(eltadd1_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd1_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_1 = + pattern->NewNode(reshape2_1_repr())->assert_is_op("reshape2"); + + auto* reshape2_1_out_var = + pattern->NewNode(reshape2_1_out_repr())->assert_is_op_output("reshape2"); + reshape2_1_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_1 = + pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2"); + auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_1_out_var->AsIntermediate()->assert_is_op_input( + "matmul"); // link 
to matmul qk + + // Third path to matmul + auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_op("matmul"); + auto* mul2_w_var = pattern->NewNode(mul2_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul", "Y"); + auto* mul2_out_var = + pattern->NewNode(mul2_out_repr())->assert_is_op_output("matmul"); + + decltype(mul2) eltadd2; + decltype(mul2) eltadd2_b_var; + decltype(mul2) eltadd2_out_var; + + mul2_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + eltadd2 = pattern->NewNode(eltadd2_repr())->assert_is_op("elementwise_add"); + eltadd2_b_var = pattern->NewNode(eltadd2_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd2_out_var = pattern->NewNode(eltadd2_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd2_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_2 = + pattern->NewNode(reshape2_2_repr())->assert_is_op("reshape2"); + + auto* reshape2_2_out_var = + pattern->NewNode(reshape2_2_out_repr())->assert_is_op_output("reshape2"); + reshape2_2_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_2 = + pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2"); + auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_2_out_var->AsIntermediate()->assert_is_ops_input( + matmul_ops); // link to matmul qkv + + // Q path + mul0->LinksFrom({input0, mul0_w_var}).LinksTo({mul0_out_var}); + eltadd0->LinksFrom({mul0_out_var, eltadd0_b_var}).LinksTo({eltadd0_out_var}); + + reshape2_0->LinksFrom({eltadd0_out_var}).LinksTo({reshape2_0_out_var}); + transpose2_0->LinksFrom({reshape2_0_out_var}).LinksTo({transpose2_0_out_var}); + // K path + mul1->LinksFrom({input0, mul1_w_var}).LinksTo({mul1_out_var}); + eltadd1->LinksFrom({mul1_out_var, eltadd1_b_var}).LinksTo({eltadd1_out_var}); + reshape2_1->LinksFrom({eltadd1_out_var}).LinksTo({reshape2_1_out_var}); + transpose2_1->LinksFrom({reshape2_1_out_var}).LinksTo({transpose2_1_out_var}); + // compute q*k + matmul_qk->LinksFrom({transpose2_0_out_var, transpose2_1_out_var}) + .LinksTo({matmul_qk_out_var}); + eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) + .LinksTo({eltadd_qk_out_var}); + softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); + // V path + mul2->LinksFrom({input0, mul2_w_var}).LinksTo({mul2_out_var}); + eltadd2->LinksFrom({mul2_out_var, eltadd2_b_var}).LinksTo({eltadd2_out_var}); + reshape2_2->LinksFrom({eltadd2_out_var}).LinksTo({reshape2_2_out_var}); + transpose2_2->LinksFrom({reshape2_2_out_var}).LinksTo({transpose2_2_out_var}); + // compute q*k*v + matmul_qkv->LinksFrom({softmax_qk_out_var, transpose2_2_out_var}) + .LinksTo({matmul_qkv_out_var}); + transpose2_qkv->LinksFrom({matmul_qkv_out_var}) + .LinksTo({transpose2_qkv_out_var}); + reshape2_qkv->LinksFrom({transpose2_qkv_out_var}) + .LinksTo({reshape2_qkv_out_var}); + + return transpose2_2_out_var; +} + +static int BuildFusionV3(Graph* graph, const std::string& name_scope, + Scope* scope) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Create pattern. 
+ MultiHeadMatmulV3Pattern multihead_pattern(pattern, name_scope); + + multihead_pattern(); + // Create New OpDesc + auto fuse_creater = [&]( + Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out, + Node* mul1_out, Node* mul2_out, Node* mul0_w, Node* mul1_w, Node* mul2_w, + Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, Node* eltadd_qk_b, + Node* reshape2, Node* reshape2_qkv_out, Node* matmul_qk) { + auto scale_attr = BOOST_GET_CONST(float, matmul_qk->Op()->GetAttr("alpha")); + + // mul (B * S * Hidden) x (Hidden * 3 * N * H) = (B * S * 3 * N * H) + // bias (B * S * 3 * N * H) + bias (3 * N * H) + // Transpose (B * S * 3 * N * H) -> (3 * B * N * S * H) + auto* wq_tensor = scope->FindVar(mul0_w->Name())->GetMutable(); + auto* wk_tensor = scope->FindVar(mul1_w->Name())->GetMutable(); + auto* wv_tensor = scope->FindVar(mul2_w->Name())->GetMutable(); + + auto* bq_tensor = + scope->FindVar(eltadd0_b->Name())->GetMutable(); + auto* bk_tensor = + scope->FindVar(eltadd1_b->Name())->GetMutable(); + auto* bv_tensor = + scope->FindVar(eltadd2_b->Name())->GetMutable(); + + auto* wq_data = wq_tensor->mutable_data(platform::CPUPlace()); + auto* wk_data = wk_tensor->mutable_data(platform::CPUPlace()); + auto* wv_data = wv_tensor->mutable_data(platform::CPUPlace()); + auto* bq_data = bq_tensor->mutable_data(platform::CPUPlace()); + auto* bk_data = bk_tensor->mutable_data(platform::CPUPlace()); + auto* bv_data = bv_tensor->mutable_data(platform::CPUPlace()); + + auto combined_w_dims = + framework::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); + auto combined_bias_dims = framework::make_ddim({3, bq_tensor->dims()[0]}); + + // reuse the mul0_w and eltadd_0_b nodes for the combined nodes. + auto* combined_w_desc = mul0_w->Var(); + combined_w_desc->SetShape({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); + combined_w_desc->SetPersistable(true); + + auto* combined_bias_desc = eltadd0_b->Var(); + combined_bias_desc->SetShape({3, bq_tensor->dims()[0]}); + combined_bias_desc->SetPersistable(true); + + framework::LoDTensor tmp_combined_w_tensor; + tmp_combined_w_tensor.Resize(combined_w_dims); + auto* tmp_combined_w_data = + tmp_combined_w_tensor.mutable_data(platform::CPUPlace()); + + std::vector w_vec = {wq_data, wk_data, wv_data}; + int dims_h = combined_w_dims[0], dims_w = combined_w_dims[2]; + // Combine the three fc weights together. 
+ for (int i = 0; i < dims_h; i++) { + for (int j = 0; j < 3; j++) { + for (int k = 0; k < dims_w; k++) { + int out_index = i * (3 * dims_w) + j * dims_w + k; + int in_index = i * dims_w + k; + tmp_combined_w_data[out_index] = w_vec[j][in_index]; + } + } + } + + wq_tensor->Resize(combined_w_dims); + auto* new_combined_w_data = + wq_tensor->mutable_data(platform::CPUPlace()); + memcpy(new_combined_w_data, tmp_combined_w_data, + sizeof(float) * wq_tensor->numel()); + + scope->EraseVars({mul1_w->Name(), mul2_w->Name()}); + + framework::LoDTensor tmp_combined_bias_tensor; + tmp_combined_bias_tensor.Resize(combined_bias_dims); + auto* tmp_combined_bias_data = + tmp_combined_bias_tensor.mutable_data(platform::CPUPlace()); + + size_t bias_size = bq_tensor->numel(); + memcpy(tmp_combined_bias_data, bq_data, sizeof(float) * bias_size); + memcpy(tmp_combined_bias_data + bias_size, bk_data, + sizeof(float) * bias_size); + memcpy(tmp_combined_bias_data + 2 * bias_size, bv_data, + sizeof(float) * bias_size); + + bq_tensor->Resize(combined_bias_dims); + auto* new_combined_bias_data = + bq_tensor->mutable_data(platform::CPUPlace()); + memcpy(new_combined_bias_data, tmp_combined_bias_data, + sizeof(float) * bq_tensor->numel()); + + scope->EraseVars({eltadd1_b->Name(), eltadd2_b->Name()}); + + auto reshape_desc = reshape2->Op(); + int head_number = + BOOST_GET_CONST(std::vector, reshape_desc->GetAttr("shape")).at(2); + + OpDesc multihead_op_desc; + multihead_op_desc.SetType("multihead_matmul"); + + multihead_op_desc.SetInput("Input", {input0->Name()}); + multihead_op_desc.SetInput("W", {mul0_w->Name()}); + multihead_op_desc.SetInput("Bias", {eltadd0_b->Name()}); + multihead_op_desc.SetInput("BiasQK", {eltadd_qk_b->Name()}); + + multihead_op_desc.SetOutput("Out", {reshape2_qkv_out->Name()}); + multihead_op_desc.SetAttr("alpha", scale_attr); + multihead_op_desc.SetAttr("head_number", head_number); + + auto* multihead = graph->CreateOpNode(&multihead_op_desc); + + IR_NODE_LINK_TO(input0, multihead); + IR_NODE_LINK_TO(mul0_w, multihead); + IR_NODE_LINK_TO(eltadd0_b, multihead); + IR_NODE_LINK_TO(eltadd_qk_b, multihead); + + IR_NODE_LINK_TO(multihead, reshape2_qkv_out); + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + // GET_IR_NODE_FROM_SUBGRAPH(dropout_out, dropout_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(input0, input0, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul0, mul0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul0_out, mul0_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul0_w, mul0_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_0, reshape2_0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_0_out, reshape2_0_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0, transpose2_0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0_out, transpose2_0_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul1, mul1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul1_out, mul1_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul1_w, mul1_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_1, reshape2_1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_1_out, reshape2_1_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_1, transpose2_1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_1_out, transpose2_1_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul2, mul2, multihead_pattern); + 
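Note (illustrative, not part of the patch): fuse_creater above packs the three Q/K/V fc weights, each of shape [hidden, W], into one [hidden, 3, W] tensor (and concatenates the three biases) so the fused multihead_matmul op reads them from a single buffer. A toy version of that packing loop, with tiny sizes:

#include <cstdio>
#include <vector>

int main() {
  const int H = 2, W = 3;  // stand-ins for hidden size and per-head width
  std::vector<float> wq(H * W, 1.f), wk(H * W, 2.f), wv(H * W, 3.f);
  std::vector<float> combined(H * 3 * W);
  const float* w_vec[3] = {wq.data(), wk.data(), wv.data()};
  for (int i = 0; i < H; ++i)
    for (int j = 0; j < 3; ++j)
      for (int k = 0; k < W; ++k)
        combined[i * (3 * W) + j * W + k] = w_vec[j][i * W + k];
  // Each packed row is [q_row_i | k_row_i | v_row_i].
  std::printf("%.0f %.0f %.0f\n", combined[0], combined[W], combined[2 * W]);
  return 0;
}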
GET_IR_NODE_FROM_SUBGRAPH(mul2_out, mul2_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul2_w, mul2_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_2, reshape2_2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_2_out, reshape2_2_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_2, transpose2_2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_2_out, transpose2_2_out, + multihead_pattern); + + // nodes need be removed + GET_IR_NODE_FROM_SUBGRAPH(eltadd0, eltadd0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd0_b, eltadd0_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd0_out, eltadd0_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd1, eltadd1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd1_b, eltadd1_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd1_out, eltadd1_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd2, eltadd2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd2_b, eltadd2_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd2_out, eltadd2_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(matmul_qk, matmul_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qk_out, matmul_qk_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk, eltadd_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_b, eltadd_qk_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_out, eltadd_qk_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk, softmax_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk_out, softmax_qk_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv, matmul_qkv, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv_out, matmul_qkv_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv, reshape2_qkv, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv_out, reshape2_qkv_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv, transpose2_qkv, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, transpose2_qkv_out, + multihead_pattern); + + // If weights or biases in qkv's fc are shared by multiple multihead_matmul + // patterns, we do not support this kind of fusion, this pass will not take + // effect. + bool is_fc_params_shared = + mul0_w->outputs.size() > 1 || mul1_w->outputs.size() > 1 || + mul2_w->outputs.size() > 1 || eltadd0_b->outputs.size() > 1 || + eltadd1_b->outputs.size() > 1 || eltadd2_b->outputs.size() > 1; + if (is_fc_params_shared) { + return; + } + fuse_creater(input0, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out, mul0_w, + mul1_w, mul2_w, eltadd0_b, eltadd1_b, eltadd2_b, eltadd_qk_b, + reshape2_0, reshape2_qkv_out, matmul_qk); + + std::unordered_set marked_nodes({eltadd0, + eltadd1, + eltadd2, + eltadd1_b, + eltadd2_b, + eltadd0_out, + eltadd1_out, + eltadd2_out, + reshape2_0, + reshape2_1, + reshape2_2, + reshape2_0_out, + reshape2_1_out, + reshape2_2_out, + transpose2_0, + transpose2_1, + transpose2_2, + transpose2_0_out, + transpose2_1_out, + transpose2_2_out, + matmul_qk, + matmul_qk_out, + eltadd_qk, + eltadd_qk_out, + softmax_qk, + softmax_qk_out, + transpose2_qkv, + transpose2_qkv_out, + matmul_qkv, + matmul_qkv_out, + mul0, + mul1, + mul2, + mul0_out, + mul1_out, + mul2_out, + mul1_w, + mul2_w, + reshape2_qkv}); + // Remove unneeded nodes. 
+ GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + }; + gpd(graph, handler); + + return fusion_count; +} + } // namespace patterns void MultiHeadMatmulFusePass::ApplyImpl(Graph* graph) const { @@ -706,6 +1179,21 @@ void MultiHeadMatmulV2FusePass::ApplyImpl(Graph* graph) const { AddStatis(fusion_count); } +void MultiHeadMatmulV3FusePass::ApplyImpl(Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, + platform::errors::Fatal( + "During the multiheadMatmul pass, The scope should not be null.")); + + int fusion_count = patterns::BuildFusionV3(graph, name_scope_, scope); + if (fusion_count > 0) { + graph->Set(kMultiheadMatmulPass, new bool(true)); + } + AddStatis(fusion_count); +} + } // namespace ir } // namespace framework } // namespace paddle @@ -715,6 +1203,8 @@ REGISTER_PASS(multihead_matmul_fuse_pass, REGISTER_PASS(multihead_matmul_fuse_pass_v2, paddle::framework::ir::MultiHeadMatmulV2FusePass); +REGISTER_PASS(multihead_matmul_fuse_pass_v3, + paddle::framework::ir::MultiHeadMatmulV3FusePass); REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v2) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() @@ -725,3 +1215,13 @@ REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v2) .EQ("scale", 0) .LE("matmul", 1) .EQ("softmax", 0)); +REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v3) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .EQ("reshape2", 0) + .EQ("transpose2", 0) + .EQ("scale", 0) + .LE("matmul", 1) + .EQ("matmul_v2", 0) + .EQ("softmax", 0)); diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h index f5327dc71080be..c7f1336211d346 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h @@ -89,9 +89,63 @@ struct MultiHeadMatmulPattern : public PatternBase { PATTERN_DECL_NODE(matmul_qkv); PATTERN_DECL_NODE(matmul_qkv_out); }; + +struct MultiHeadMatmulV3Pattern : public PatternBase { + MultiHeadMatmulV3Pattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "multihead_matmul_v3") {} + + PDNode* operator()(); + + // declare operator node's name + PATTERN_DECL_NODE(input0); + PATTERN_DECL_NODE(mul0); + PATTERN_DECL_NODE(mul1); + PATTERN_DECL_NODE(mul2); + PATTERN_DECL_NODE(mul0_w); + PATTERN_DECL_NODE(mul1_w); + PATTERN_DECL_NODE(mul2_w); + PATTERN_DECL_NODE(mul0_out); + PATTERN_DECL_NODE(mul1_out); + PATTERN_DECL_NODE(mul2_out); + PATTERN_DECL_NODE(eltadd0); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd1); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd2); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd1_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd2_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_out); + PATTERN_DECL_NODE(eltadd1_out); + PATTERN_DECL_NODE(eltadd2_out); + PATTERN_DECL_NODE(reshape2_0); + PATTERN_DECL_NODE(reshape2_1); + PATTERN_DECL_NODE(reshape2_2); + PATTERN_DECL_NODE(reshape2_qkv); + PATTERN_DECL_NODE(reshape2_0_out); + PATTERN_DECL_NODE(reshape2_1_out); + PATTERN_DECL_NODE(reshape2_2_out); + PATTERN_DECL_NODE(reshape2_qkv_out); + PATTERN_DECL_NODE(transpose2_0); + PATTERN_DECL_NODE(transpose2_1); + PATTERN_DECL_NODE(transpose2_2); + PATTERN_DECL_NODE(transpose2_qkv); + PATTERN_DECL_NODE(transpose2_0_out); + PATTERN_DECL_NODE(transpose2_1_out); 
+ PATTERN_DECL_NODE(transpose2_2_out); + PATTERN_DECL_NODE(transpose2_qkv_out); + PATTERN_DECL_NODE(matmul_qk); + PATTERN_DECL_NODE(matmul_qk_out); + PATTERN_DECL_NODE(eltadd_qk); + PATTERN_DECL_NODE(eltadd_qk_b); + PATTERN_DECL_NODE(eltadd_qk_out); + PATTERN_DECL_NODE(softmax_qk); + PATTERN_DECL_NODE(softmax_qk_out); + + PATTERN_DECL_NODE(matmul_qkv); + PATTERN_DECL_NODE(matmul_qkv_out); +}; + } // namespace patterns -// The MulGRUFusePass and MulGRUFusePass will fuse to the same FusionGRU op. class MultiHeadMatmulFusePass : public FusePassBase { public: virtual ~MultiHeadMatmulFusePass() {} @@ -112,6 +166,16 @@ class MultiHeadMatmulV2FusePass : public FusePassBase { const std::string name_scope_{"multihead_matmul_fuse_v2"}; }; +class MultiHeadMatmulV3FusePass : public FusePassBase { + public: + virtual ~MultiHeadMatmulV3FusePass() {} + + protected: + void ApplyImpl(Graph* graph) const; + + const std::string name_scope_{"multihead_matmul_fuse_v3"}; +}; + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc index ada20113077c18..232e1d8da4ded3 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc @@ -141,14 +141,6 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance, fused_pattern); - // check if is in ernie or not - if (!graph->Has(kEmbEltwiseLayernormPass) || - !graph->Has(kMultiheadMatmulPass)) { - LOG(INFO) << "The skip_layernorm_fuse_pass is only supported in " - << "Ernie/Bert model. Just skip this pass."; - return; - } - std::unordered_set del_node_set; // Create an SkipLayerNorm op node @@ -161,6 +153,10 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { new_desc.SetInput("Scale", {layer_norm_scale->Name()}); new_desc.SetInput("Bias", {layer_norm_bias->Name()}); + if (elementwise->Op()->HasAttr("out_threshold")) { + new_desc.SetAttr("enable_int8", true); + } + // outputs new_desc.SetOutput("Out", {layer_norm_out->Name()}); diff --git a/paddle/fluid/framework/library_type.h b/paddle/fluid/framework/library_type.h index 4307e51862df57..8fe314cf5f18c5 100644 --- a/paddle/fluid/framework/library_type.h +++ b/paddle/fluid/framework/library_type.h @@ -61,6 +61,8 @@ inline LibraryType StringToLibraryType(const char* ctype) { return LibraryType::kPlain; } else if (s == std::string("XPU")) { return LibraryType::kPlain; + } else if (s == std::string("NPU")) { + return LibraryType::kPlain; } else if (s == std::string("CUDA")) { return LibraryType::kPlain; } else { diff --git a/paddle/fluid/framework/load_op_lib.h b/paddle/fluid/framework/load_op_lib.h deleted file mode 100644 index 16cffe119d63e0..00000000000000 --- a/paddle/fluid/framework/load_op_lib.h +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" - -namespace paddle { -namespace framework { - -template -T *DynLoad(void *handle, std::string name) { - T *func = reinterpret_cast(dlsym(handle, name.c_str())); -#if !defined(_WIN32) - auto errorno = dlerror(); -#else - auto errorno = GetLastError(); -#endif // !_WIN32 - PADDLE_ENFORCE_NOT_NULL( - func, - platform::errors::NotFound( - "Failed to load dynamic operator library, error code(%s).", errorno)); - return func; -} - -void LoadOpLib(const std::string &dso_name) { - void *handle = paddle::platform::dynload::GetOpDsoHandle(dso_name); - - typedef OpInfoMap &get_op_info_t(); - get_op_info_t *get_op_info = - DynLoad(handle, "PD_GetOpInfoMap"); - auto &op_info = get_op_info(); - auto *dyn_info_map = op_info.mutable_map(); - - typedef std::vector grad_op_desc_maker_t( - const OpDesc &, const std::unordered_set &, - std::unordered_map *, - const std::vector &); - - grad_op_desc_maker_t *grad_op_desc_maker = - DynLoad(handle, "PD_GetGradOpDescStrs"); - - auto &info_map = OpInfoMap::Instance(); - for (const auto &n : *(dyn_info_map)) { - auto type = n.first; - if (type == "recurrent" || type == "recurrent_grad" || - type == "conditional_block" || type == "conditional_block_grad") { - continue; - } - PADDLE_ENFORCE_NE(info_map.Has(n.first), true, - platform::errors::AlreadyExists( - "Operator (%s) has been registered.", type)); - OpInfo info; - info.creator_ = n.second.creator_; - - // If get the protocol buffer from dynamic library directly, there - // will be deconstruction error - // ** Error in `python`: free(): invalid pointer: - // ... paddle::framework::proto::OpDesc::SharedDtor() - // It seems a bug in protobuf, see - // https://github.com/protocolbuffers/protobuf/issues/435 - // So, get the serialized binary string from dynamic library, - // then deserialize to protocol buffer. 
- info.grad_op_maker_ = [grad_op_desc_maker]( - const OpDesc &op_desc, - const std::unordered_set &no_grad_set, - std::unordered_map *grad_to_var, - const std::vector &grad_block) { - std::vector strs = - grad_op_desc_maker(op_desc, no_grad_set, grad_to_var, grad_block); - std::vector> ret; - for (auto &str : strs) { - proto::OpDesc proto_desc; - PADDLE_ENFORCE_EQ(proto_desc.ParseFromString(str), true, - platform::errors::InvalidArgument( - "Failed to parse OpDesc from string.")); - ret.emplace_back(new OpDesc(proto_desc, nullptr)); - } - return ret; - }; - info.proto_ = n.second.proto_; - info.checker_ = n.second.checker_; - info.infer_var_type_ = n.second.infer_var_type_; - info.infer_shape_ = n.second.infer_shape_; - info.infer_inplace_ = n.second.infer_inplace_; - info.infer_no_need_buffer_vars_ = n.second.infer_no_need_buffer_vars_; - info.use_default_grad_op_desc_maker_ = - n.second.use_default_grad_op_desc_maker_; - info.use_empty_grad_op_desc_maker_ = n.second.use_empty_grad_op_desc_maker_; - - info_map.Insert(type, info); - } - - typedef void init_device_t(platform::DeviceContextPool *); - init_device_t *init_dev = - DynLoad(handle, "PD_InitDevicesPool"); - init_dev(&(platform::DeviceContextPool::Instance())); -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index ff8e71b92e0ac5..198bb65863bb6a 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -38,6 +38,13 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc, need_merge_var_names_.push_back( trainer_desc.downpour_param().stat_var_names(i)); } +#ifdef PADDLE_WITH_HETERPS + for (int i = 0; i < thread_num_; ++i) { + int num = trainer_desc.worker_places(i); + platform::CUDAPlace place = platform::CUDAPlace(num); + places_.push_back(place); + } +#endif // get filelist from trainer_desc here const std::vector readers = dataset->GetReaders(); @@ -102,13 +109,42 @@ void MultiTrainer::InitDumpEnv() { void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, const platform::Place& place) { for (int i = 0; i < thread_num_; ++i) { +#ifdef PADDLE_WITH_HETERPS + workers_[i]->SetPlace(places_[i]); + workers_[i]->SetReaderPlace(places_[i]); +#else workers_[i]->SetPlace(place); workers_[i]->SetReaderPlace(place); +#endif workers_[i]->SetRootScope(root_scope_); workers_[i]->CreateDeviceResource(main_program); // Program workers_[i]->BindingDataFeedMemory(); workers_[i]->CacheProgram(main_program); } +#ifdef PADDLE_WITH_HETERPS + for (int num = 0; num < thread_num_; ++num) { + auto place = places_[num]; + Scope* scope = workers_[num]->GetThreadScope(); + auto& block = main_program.Block(0); + for (auto& var : block.AllVars()) { + if (var->Persistable()) { + auto name = var->Name(); + Variable* root_var = root_scope_->FindVar(name); + if (!root_var) { + continue; + } + if (root_var->IsType()) { + continue; + } + LoDTensor* root_tensor = root_var->GetMutable(); + auto* ptr = scope->Var(name); + InitializeVariable(ptr, proto::VarType::LOD_TENSOR); + LoDTensor* thread_tensor = ptr->GetMutable(); + TensorCopy(*root_tensor, place, thread_tensor); + } + } + } +#endif } void MultiTrainer::InitOtherEnv(const ProgramDesc& main_program) { @@ -138,10 +174,77 @@ void MultiTrainer::Run() { } } +#ifdef PADDLE_WITH_HETERPS +void MultiTrainer::MergeDenseParam() { + auto communicator = paddle::distributed::Communicator::GetInstance(); + auto& recv_ctx = communicator->GetRecvCtxMap(); + Scope* 
thread_scope = workers_[0]->GetThreadScope(); + for (auto& iter : recv_ctx) { + auto& varnames = iter.second; + for (auto& name : varnames) { + Variable* root_var = root_scope_->FindVar(name); + LoDTensor* root_tensor = root_var->GetMutable(); + Variable* var = thread_scope->FindVar(name); + LoDTensor* tensor = var->GetMutable(); + TensorCopy((*tensor), root_tensor->place(), root_tensor); + } + } +} +#endif + +template +void MultiTrainer::MergeToRootScope(LoDTensor* root_tensor, LoDTensor* tensor) { + LoDTensor tmp_root; + TensorCopy(*root_tensor, platform::CPUPlace(), &tmp_root); + T* tmp_root_data = tmp_root.data(); + LoDTensor tmp_tensor; + TensorCopy(*tensor, platform::CPUPlace(), &tmp_tensor); + T* data = tmp_tensor.data(); + for (int i = 0; i < tmp_tensor.numel(); i++) { + tmp_root_data[i] += data[i]; + } + TensorCopy(tmp_root, platform::CPUPlace(), root_tensor); +} + void MultiTrainer::Finalize() { if (need_dump_field_ || need_dump_param_) { FinalizeDumpEnv(); } +#ifdef PADDLE_WITH_HETERPS + for (size_t i = 0; i < need_merge_var_names_.size(); i++) { + Variable* root_var = root_scope_->FindVar(need_merge_var_names_[i]); + if (root_var == nullptr) { + continue; + } + LoDTensor* root_tensor = root_var->GetMutable(); + + for (size_t j = 0; j < places_.size(); j++) { + Scope* cur_thread_scope = workers_[j]->GetThreadScope(); + Variable* thread_var = + cur_thread_scope->FindVar(need_merge_var_names_[i]); + if (thread_var == nullptr) { + continue; + } + LoDTensor* thread_tensor = thread_var->GetMutable(); +#define MergeCallback(cpp_type, proto_type) \ + do { \ + if (root_tensor->type() == proto_type) { \ + if (thread_tensor->type() != proto_type) { \ + VLOG(0) << "Error: thread id=" << j << ", need_merge_var_names_[" << i \ + << "] " << need_merge_var_names_[i] \ + << ", root tensor type=" << root_tensor->type() \ + << ", thread tensor type=" << thread_tensor->type(); \ + exit(-1); \ + } \ + MergeToRootScope(root_tensor, thread_tensor); \ + } \ + } while (0) + _ForEachDataType_(MergeCallback); + } + } + MergeDenseParam(); + +#endif root_scope_->DropKids(); } diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 472c6f408266af..4c529329761227 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -304,6 +304,9 @@ struct OpKernelRegistrarFunctorEx &places, const BuildStrategy &build_strategy, ir::Graph *graph) : member_(new ParallelExecutorPrivate(places, scope)) { + PADDLE_ENFORCE(places.size() > 0 && !is_npu_place(places[0]), + platform::errors::Unavailable( + "NPU is not supported in ParallelExecutor")); InitP2P(places); ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), member_->places_.size()); diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index a97fc2e75aab14..5968df548dfb0f 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -71,37 +71,16 @@ void PipelineTrainer::CopyParameters(int microbatch_id, const ProgramDesc& program, const platform::Place& place) { auto& global_block = program.Block(0); - std::map param_map; - for (auto& var : global_block.AllVars()) { - if (var->Persistable()) { - param_map[var->Name()] = 1; - } - } for (auto& var : global_block.AllVars()) { - bool is_param_grad = false; - size_t pos = 0; - // A magic suffix to indicate the merged gradient - std::string magicSuffix = std::string(kGradVarSuffix) + "@MERGED"; - if ((pos = var->Name().find(magicSuffix)) != 
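Note (illustrative, not part of the patch): under PADDLE_WITH_HETERPS, MultiTrainer::Finalize above copies each worker's persistable tensor to CPU and accumulates it element-wise into the root-scope tensor via MergeToRootScope, then calls MergeDenseParam. The same accumulation with plain vectors:

#include <cstdio>
#include <vector>

int main() {
  std::vector<float> root{1.f, 1.f, 1.f};  // root-scope copy of a variable
  std::vector<std::vector<float>> thread_copies{{0.50f, 0.50f, 0.50f},
                                                {0.25f, 0.25f, 0.25f}};
  for (const auto& t : thread_copies)      // one copy per worker scope
    for (size_t i = 0; i < root.size(); ++i) root[i] += t[i];
  std::printf("%.2f %.2f %.2f\n", root[0], root[1], root[2]);  // 1.75 each
  return 0;
}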
std::string::npos) { - auto prefix_name = var->Name().substr(0, pos); - if (param_map.find(prefix_name) != param_map.end()) { - is_param_grad = true; - } - } if (var->Persistable() && microbatch_id == 0) { auto* ptr = root_scope_->Var(var->Name()); InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create persistable var: " << var->Name() - << ", which pointer is " << ptr; - } else if (is_param_grad && microbatch_id == 0) { - auto* ptr = minibatch_scope_->Var(var->Name()); - InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create grad for persistable var: " << var->Name() + VLOG(5) << "Create persistable var: " << var->Name() << ", which pointer is " << ptr; - } else if (!var->Persistable() && !is_param_grad) { + } else if (!var->Persistable()) { auto* ptr = microbatch_scopes_[microbatch_id]->Var(var->Name()); - VLOG(3) << "Create variable " << var->Name() << " for microbatch " + VLOG(5) << "Create variable " << var->Name() << " for microbatch " << microbatch_id << ", which pointer is " << ptr; InitializeVariable(ptr, var->GetType()); } diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index e77932fa5f2265..39bc3f040639bf 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -19,10 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker_factory.h" -#include "paddle/fluid/framework/fleet/fleet_wrapper.h" -#include "paddle/fluid/framework/fleet/heter_context.h" -#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" -#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/trainer.h" #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ (defined PADDLE_WITH_PSLIB) @@ -64,7 +60,6 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, pull_dense_worker_ = PullDenseWorker::GetInstance(); pull_dense_worker_->Initialize(trainer_desc); SetDebug(trainer_desc.debug()); - fleet_ptr_ = FleetWrapper::GetInstance(); trainer_desc_ = trainer_desc; workers_.resize(place_num); for (int i = 0; i < place_num; ++i) { diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index 2597901d91f36b..d178c4e556ca57 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -14,7 +14,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/device_worker_factory.h" -#include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/framework/fleet/heter_wrapper.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 54f77981306336..101463756c0a51 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -125,25 +125,54 @@ TEST(Tensor, MutableData) { float* p2 = nullptr; // initialization p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); auto p1_holder = src_tensor.Holder(); EXPECT_NE(p1, nullptr); // set src_tensor a new dim with large size // momery is supposed to be re-allocated p2 = src_tensor.mutable_data(framework::make_ddim({3, 1024}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); auto p2_holder = src_tensor.Holder(); EXPECT_NE(p2, nullptr); EXPECT_NE(p1_holder.get(), p2_holder.get()); // set src_tensor a new dim with same size // momery block is supposed to be unchanged p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); EXPECT_EQ(p1, p2); // set src_tensor a new dim with smaller size // momery block is supposed to be unchanged p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); + EXPECT_EQ(p1, p2); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + { + framework::Tensor src_tensor; + float* p1 = nullptr; + float* p2 = nullptr; + // initialization + p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), + platform::NPUPlace(0)); + auto p1_holder = src_tensor.Holder(); + EXPECT_NE(p1, nullptr); + // set src_tensor a new dim with large size + // momery is supposed to be re-allocated + p2 = src_tensor.mutable_data(framework::make_ddim({3, 1024}), + platform::NPUPlace(0)); + auto p2_holder = src_tensor.Holder(); + EXPECT_NE(p2, nullptr); + EXPECT_NE(p1_holder.get(), p2_holder.get()); + // set src_tensor a new dim with same size + // momery block is supposed to be unchanged + p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), + platform::NPUPlace(0)); + EXPECT_EQ(p1, p2); + // set src_tensor a new dim with smaller size + // momery block is supposed to be unchanged + p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::NPUPlace(0)); EXPECT_EQ(p1, p2); } #endif @@ -179,7 +208,17 @@ TEST(Tensor, ShareDataWith) { framework::Tensor src_tensor; framework::Tensor dst_tensor; src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); + dst_tensor.ShareDataWith(src_tensor); + ASSERT_EQ(src_tensor.data(), dst_tensor.data()); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + { + framework::Tensor src_tensor; + framework::Tensor dst_tensor; + src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), + platform::NPUPlace(0)); dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } @@ -216,7 +255,34 @@ TEST(Tensor, Slice) { { framework::Tensor src_tensor; src_tensor.mutable_data(framework::make_ddim({6, 9}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); + framework::Tensor slice_tensor = src_tensor.Slice(2, 6); + framework::DDim slice_dims = slice_tensor.dims(); + ASSERT_EQ(arity(slice_dims), 2); + EXPECT_EQ(slice_dims[0], 4); + EXPECT_EQ(slice_dims[1], 9); + + 
uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = + reinterpret_cast(src_tensor.mutable_data( + src_tensor.dims(), platform::CUDAPlace(0))); + uintptr_t slice_data_address = + reinterpret_cast(slice_tensor.data()); + uintptr_t slice_mutable_data_address = + reinterpret_cast(slice_tensor.mutable_data( + slice_tensor.dims(), platform::CUDAPlace(0))); + EXPECT_EQ(src_data_address, src_mutable_data_address); + EXPECT_EQ(slice_data_address, slice_mutable_data_address); + EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); + } +#endif + +#ifdef PADDLE_WITH_ASCEND_CL + { + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({6, 9}), + platform::NPUPlace(0)); framework::Tensor slice_tensor = src_tensor.Slice(2, 6); framework::DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 2); @@ -227,12 +293,12 @@ TEST(Tensor, Slice) { reinterpret_cast(src_tensor.data()); uintptr_t src_mutable_data_address = reinterpret_cast(src_tensor.mutable_data( - src_tensor.dims(), platform::CUDAPlace())); + src_tensor.dims(), platform::NPUPlace(0))); uintptr_t slice_data_address = reinterpret_cast(slice_tensor.data()); uintptr_t slice_mutable_data_address = reinterpret_cast(slice_tensor.mutable_data( - slice_tensor.dims(), platform::CUDAPlace())); + slice_tensor.dims(), platform::NPUPlace(0))); EXPECT_EQ(src_data_address, src_mutable_data_address); EXPECT_EQ(slice_data_address, slice_mutable_data_address); EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index c6ac30a369859d..d6882b25d22588 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -97,6 +97,42 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + // TODO(zhiqiu): handle different condition like CUDA code below + else if (platform::is_npu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + stream); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_npu_place(dst_place)) { + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, + stream); + } + else if (platform::is_npu_place(src_place) && // NOLINT + platform::is_npu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + stream); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { @@ -304,6 +340,35 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, "Copy from %s to %s is not supported.", 
src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { /* npu -> cpu*/ + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + nullptr); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_npu_place(dst_place)) { /* cpu -> npu*/ + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, + nullptr); + } + else if (platform::is_npu_place(src_place) && // NOLINT + platform::is_npu_place(dst_place)) { /* npu -> npu*/ + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data sync from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + nullptr); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { @@ -431,6 +496,13 @@ class AnyVisitor : public boost::static_visitor { return GetResultHelper(out, gpu); } + bool GetResult(const framework::Tensor& out, + const platform::NPUPlace& npu) const { + PADDLE_THROW( + platform::errors::Unimplemented("Not supported on place (%s) ", npu)); + // return GetResultHelper(out, npu); + } + bool GetResult(const framework::Tensor& out, const platform::CPUPlace& cpu) const { return *out.data(); @@ -633,6 +705,10 @@ struct BothFalseVisitor : public boost::static_visitor<> { #endif } + void VisitorImpl(const platform::NPUPlace& npu) const { + // TODO(zhiqiu) + } + void VisitorImpl(const platform::CPUPlace& cpu) const { int num = in_.numel(); const bool* in_ptr = in_.data(); diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index fd0f98784ceb0a..85af9e50087024 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -135,6 +135,7 @@ void TensorFromArray(const T* src, const size_t& array_size, } #endif } + template void TensorFromVector(const std::vector& src, const platform::DeviceContext& ctx, Tensor* dst) { @@ -157,6 +158,57 @@ void TensorFromVector(const std::vector& src, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(dst_place)) { // NOLINT + memory::Copy( + BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place, + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +} + +// The fully specialized function should be inline to avoid +// multi-definition. +template <> +inline void TensorFromVector(const std::vector& src, + const platform::DeviceContext& ctx, Tensor* dst) { + // vector has no data() member, use array instead. 
+ // See details: + // https://stackoverflow.com/questions/46115669/why-does-stdvectorbool-have-no-data/46115714 + bool* array = new bool[src.size()]; + for (unsigned int i = 0; i < src.size(); i++) { + array[i] = static_cast(src[i]); + } + + auto dst_place = ctx.GetPlace(); + auto src_ptr = static_cast(array); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(bool); + + if (platform::is_cpu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + src_place, src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(dst_place)) { // NOLINT + memory::Copy( + BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place, + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(dst_place)) { // NOLINT + memory::Copy( + BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place, + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif + delete[] array; } template @@ -171,6 +223,23 @@ void TensorFromVector(const std::vector& src, Tensor* dst) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } +template <> +inline void TensorFromVector(const std::vector& src, Tensor* dst) { + bool* array = new bool[src.size()]; + for (unsigned int i = 0; i < src.size(); i++) { + array[i] = static_cast(src[i]); + } + platform::CPUPlace dst_place = platform::CPUPlace(); + auto src_ptr = static_cast(array); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(bool); + + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + delete[] array; +} + template void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, std::vector* dst) { @@ -194,6 +263,54 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(src.place())) { // NOLINT + memory::Copy( + dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()), + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +} + +template <> +inline void TensorToVector(const Tensor& src, + const platform::DeviceContext& ctx, + std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(bool); + + bool* array = new bool[src.numel()]; + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(array); + + if (platform::is_cpu_place(src.place())) { + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, + size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src.place())) { // NOLINT + memory::Copy( + dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()), + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(src.place())) { // NOLINT + memory::Copy( + dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()), + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif + for (unsigned int i = 0; i < src.numel(); i++) { + (*dst)[i] = static_cast(array[i]); + } + delete[] array; } template @@ -215,6 +332,32 @@ void TensorToVector(const Tensor& src, std::vector* dst) { BOOST_GET_CONST(platform::CPUPlace, 
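Note (illustrative, not part of the patch): the bool specializations above exist because std::vector<bool> is bit-packed and has no data() member, so the elements are first expanded into a plain bool buffer that a raw memory copy can read (and, on the read side, copied back element by element). A minimal demonstration of that staging step:

#include <cstring>
#include <vector>

int main() {
  std::vector<bool> src{true, false, true};
  bool* array = new bool[src.size()];  // temporary contiguous buffer
  for (size_t i = 0; i < src.size(); ++i) array[i] = src[i];
  bool dst[3];
  std::memcpy(dst, array, src.size() * sizeof(bool));  // raw copy now works
  delete[] array;
  return (dst[0] && !dst[1] && dst[2]) ? 0 : 1;
}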
src.place()), src_ptr, size); } +template <> +inline void TensorToVector(const Tensor& src, std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(bool); + + bool* array = new bool[src.numel()]; + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(array); + + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(src.place()), true, + platform::errors::InvalidArgument( + "The input tensor should be CPU device, but actually it is in %s.", + src.place())); + + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size); + + for (unsigned int i = 0; i < src.numel(); i++) { + (*dst)[i] = static_cast(array[i]); + } + delete[] array; +} + std::ostream& operator<<(std::ostream& os, const Tensor& t); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index c32efd0a470be2..8587ee8d1e9196 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -242,6 +242,61 @@ TEST(TensorToVector, Tensor) { #endif } +TEST(TensorToVector, Tensor_bool) { + { + paddle::framework::Tensor src; + bool* src_ptr = + src.mutable_data({3, 3}, paddle::platform::CPUPlace()); + for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = static_cast(i % 2); + } + + paddle::platform::CPUPlace place; + std::vector dst; + paddle::framework::TensorToVector(src, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); + } + } +#ifdef PADDLE_WITH_CUDA + { + std::vector src_vec = { + false, true, false, true, false, true, false, true, false, + }; + paddle::framework::Tensor gpu_tensor; + paddle::platform::CUDAPlace place; + paddle::platform::CUDADeviceContext gpu_ctx(place); + paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); + + std::vector dst; + paddle::framework::TensorToVector(gpu_tensor, gpu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); + } + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + { + std::vector src_vec = { + false, true, false, true, false, true, false, true, false, + }; + paddle::framework::Tensor npu_tensor; + paddle::platform::NPUPlace place(0); + paddle::platform::NPUDeviceContext npu_ctx(place); + paddle::framework::TensorFromVector(src_vec, npu_ctx, &npu_tensor); + + std::vector dst; + paddle::framework::TensorToVector(npu_tensor, npu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); + } + } +#endif +} + TEST(TensorFromDLPack, Tensor) { { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index ca290a50b42fe0..7efb89ad7d9d9c 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -109,13 +109,22 @@ class MultiTrainer : public TrainerBase { virtual Scope* GetWorkerScope(int thread_id); virtual std::string GetDumpPath(int tid); + template + void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor); +#ifdef PADDLE_WITH_HETERPS + + void MergeDenseParam(); +#endif + protected: int thread_num_; std::vector threads_; std::vector readers_; std::vector> workers_; std::vector need_merge_var_names_; - +#ifdef PADDLE_WITH_HETERPS + std::vector places_; +#endif int mpi_rank_; int mpi_size_; int dump_file_num_; @@ -313,7 +322,6 @@ class PSGPUTrainer : public TrainerBase { float scale_datanorm_; paddle::platform::Place place_; ProgramDesc program_; - 
std::shared_ptr fleet_ptr_; std::shared_ptr pull_dense_worker_; std::vector> workers_; std::vector places_; diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index a2b5a98401e236..e43cccfe648165 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -45,6 +45,17 @@ using Attribute = boost::variant< using AttributeMap = std::unordered_map; +#ifdef PADDLE_WITH_ASCEND_CL +using NPUAttribute = + boost::variant, + std::vector, std::vector, bool, + std::vector, BlockDesc*, int64_t, + std::vector, std::vector, + std::vector, std::vector>>; + +using NPUAttributeMap = std::unordered_map; +#endif + using OpCreator = std::function; diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 29ba54986801f1..d5350744e4c553 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -36,48 +36,73 @@ DECLARE_bool(sort_sum_gradient); namespace paddle { namespace imperative { -void BasicEngine::Init(VarBase* var, bool retain_graph) { +void BasicEngine::Init( + const std::vector>& tensors, + const std::vector>& grad_tensors, + bool retain_graph) { retain_graph_ = retain_graph; - init_node_ = var->GradVarBase()->GradNode(); - PADDLE_ENFORCE_EQ(var->GradVarBase()->GraphIsFreed(), false, - platform::errors::Unavailable( - "%s trying to backward through the same graph a second " - "time, but this graph have already been freed. Please " - "specify Tensor.backward(retain_graph=True) when " - "calling backward at the first time.", - var->Name())); - - if (!retain_graph) { - VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name() - << " because of retain_graph=False when calling backward"; - var->GradVarBase()->SetGraphIsFreed(true); - var->GradVarBase()->ClearGradNode(); - } - if (init_node_ == nullptr || var->OverridedStopGradient()) { - VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " - "stop_gradient=True: " - << var->Name(); - return; - } + PADDLE_ENFORCE_EQ( + tensors.size(), grad_tensors.size(), + platform::errors::Unavailable( + "The size of tensors do not equal the size of grad_tensors," + "the size of tensors is %s, but the size of grad_tensors is %s.", + tensors.size(), grad_tensors.size())); + + for (size_t i = 0; i < tensors.size(); ++i) { + auto var = tensors[i]; + auto grad_tensor = grad_tensors[i]; + + auto init_node = var->GradVarBase()->GradNode(); + PADDLE_ENFORCE_EQ( + var->GradVarBase()->GraphIsFreed(), false, + platform::errors::Unavailable( + "%s trying to backward through the same graph a second " + "time, but this graph have already been freed. 
Please " + "specify Tensor.backward(retain_graph=True) when " + "calling backward at the first time.", + var->Name())); + + if (!retain_graph) { + VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name() + << " because of retain_graph=False when calling backward"; + var->GradVarBase()->SetGraphIsFreed(true); + var->GradVarBase()->ClearGradNode(); + } + + if (init_node == nullptr || var->OverridedStopGradient()) { + VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " + "stop_gradient=True: " + << var->Name(); + continue; + } - VLOG(3) << "Init first node of backward"; + VLOG(3) << "Init node of backward"; + + PADDLE_ENFORCE_EQ( + var->HasGradVar(), true, + platform::errors::NotFound("Tensor %s has no gradient", var->Name())); + + auto& fwd_var = var->Var().Get(); + auto* grad_var = + var->GradVarBase()->MutableVar()->GetMutable(); + VLOG(6) << "init loss grad:" << var->GradVarBase()->Name() + << " as stop_gradient false"; + var->GradVarBase()->InnerSetOverridedStopGradient(false); + auto* dev_ctx = + platform::DeviceContextPool::Instance().Get(fwd_var.place()); + if (grad_tensor == nullptr) { + grad_var->Resize(fwd_var.dims()); + grad_var->mutable_data(fwd_var.place(), fwd_var.type()); + operators::math::set_constant(*dev_ctx, grad_var, 1.0); + } else { + paddle::framework::TensorCopy( + grad_tensor->Var().Get(), fwd_var.place(), + *dev_ctx, grad_var); + } - PADDLE_ENFORCE_EQ( - var->HasGradVar(), true, - platform::errors::NotFound("Grad variable not exist for variable %s", - var->Name())); - - auto& fwd_var = var->Var().Get(); - auto* grad_var = - var->GradVarBase()->MutableVar()->GetMutable(); - VLOG(6) << "init loss grad:" << var->GradVarBase()->Name() - << " as stop_gradient false"; - var->GradVarBase()->InnerSetOverridedStopGradient(false); - auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place()); - grad_var->Resize(fwd_var.dims()); - grad_var->mutable_data(fwd_var.place(), fwd_var.type()); - operators::math::set_constant(*dev_ctx, grad_var, 1.0); + init_nodes_.push_back(init_node); + } } void BasicEngine::CheckBackwardInputs(const OpBase& op) { @@ -141,17 +166,6 @@ void BasicEngine::PrepareGradAccumulators( << var.get() << ") that don't have grad node with reference count " << accumulator->RefCnt(); - - if (var->HasLeafHooks()) { - VLOG(3) << "Grad variable wrapper (" << var->Name() - << ") has leaf grad hooks."; - PADDLE_ENFORCE_NE( - var->HasGradNode(), true, - platform::errors::PermissionDenied( - "Only leaf Tensor's gradient can append hook to " - "Gradientaccumulator.")); - accumulator->SetPostHooks(var->GetLeafHooks()); - } } else { // Because Inplace op overwrites the grad_node of the input grad_var. 
So // only the information of grad_pending_node can be used to find the @@ -235,8 +249,10 @@ void BasicEngine::PrepareDeps() { std::queue q; std::unordered_set visited; - q.push(init_node_.get()); - visited.insert(init_node_.get()); + for (size_t i = 0; i < init_nodes_.size(); ++i) { + q.push(init_nodes_[i].get()); + visited.insert(init_nodes_[i].get()); + } while (!q.empty()) { auto* cur_node = q.front(); @@ -262,15 +278,41 @@ void BasicEngine::PrepareDeps() { } } +static std::shared_ptr> CallGradientHooks( + const NameVarMap& bwd_ins, const std::string& op_type) { + std::shared_ptr> tmp_ins_ptr = nullptr; + for (const auto& pair : bwd_ins) { + for (size_t i = 0; i < pair.second.size(); ++i) { + auto& var = pair.second[i]; + if (var->HasVariableWrapperHook()) { + if (tmp_ins_ptr == nullptr) { + tmp_ins_ptr = std::make_shared>(bwd_ins); + } + VLOG(3) << "Call " << var->GetVariableWrapperHooks().size() + << " hooks of " << op_type << "'s input `" << pair.first + << "`'s var `" << var->Name() << "`."; + auto tmp_var = var; + for (const auto& hook_pair : var->GetVariableWrapperHooks()) { + tmp_var = (*hook_pair.second)(tmp_var); + } + (*tmp_ins_ptr)[pair.first][i] = tmp_var; + } + } + } + return tmp_ins_ptr; +} + void BasicEngine::Execute() { - if (init_node_ == nullptr) { + if (init_nodes_.empty()) { return; } PrepareDeps(); // Start execute Computation graph std::queue> q; - q.push(std::move(init_node_)); + for (size_t i = 0; i < init_nodes_.size(); ++i) { + q.push(std::move(init_nodes_[i])); + } size_t op_num = 0; @@ -292,10 +334,15 @@ void BasicEngine::Execute() { auto& bwd_ins = cur_op.GetInsMap(); auto& bwd_outs = cur_op.GetOutsMap(); + /** + * [ Why need temporary outputs here? ] + * + * - construct the temp output map, avoid to disrupt graph + * - replace the element in the map by temp var, because a + * var may be coresponding to several grad var in one op + */ NameVarMap tmp_outs(bwd_outs); - // 1. construct the temp output map, avoid to disrupt graph - // 2. replace the element in the map by temp var, because a - // var may be coresponding to several grad var in one op + for (auto& pair : tmp_outs) { if (!pair.second.IsGrad()) { continue; @@ -408,10 +455,28 @@ void BasicEngine::Execute() { } } + /** + * [ Why need temporary inputs here? ] + * + * - Hook execution should not change original input tensor. + * User can register hook for Tensor's gradient, It is expected + * that the hook only affects the gradient of the backward + * propagation, and does not affect the gradient value input + * as the hook. + * - use `tmp_ins_ptr`, only copy bwd_ins when the var in bwd_ins + * hold hooks + */ + auto tmp_ins_ptr = CallGradientHooks(bwd_ins, cur_op.Type()); + { VLOG(3) << "Start to execute grad op " << cur_op.Type(); - OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(), - cur_op.place()); + if (tmp_ins_ptr == nullptr) { + OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(), + cur_op.place()); + } else { + OpBase::Run(cur_op.InnerOp(), *tmp_ins_ptr, tmp_outs, cur_op.Attrs(), + cur_op.place()); + } } for (auto& pair : inplace_output_grad_var_list_) { @@ -428,15 +493,14 @@ void BasicEngine::Execute() { if (!accumulator->SumGradCompleted()) { continue; } - // 1. Call Hooks for **inner_var_** + // 1. Call Hooks for `inner_var_` + accumulator->CallGradientHooks(); - // 2. Sum Gradient with Previous Graph + // 2. Sum Gradient `inner_var_` to `var_` of Current or Previous Graph accumulator->AccumulateGrad(); - // 3. 
Call backward Hooks for **var_** - if (accumulator->HasPostHooks()) { - accumulator->CallBackwardPostHooks(); - } + // 3. Call backward Hooks for `var_` + accumulator->CallReduceHooks(); } need_accu_var_list_.clear(); @@ -470,7 +534,7 @@ void BasicEngine::Execute() { } void BasicEngine::Clear() { - init_node_.reset(); + init_nodes_.clear(); node_deps_.clear(); accumulators_.clear(); accumulators_with_grad_node_.clear(); diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h index a2ad8b5f8aa61e..49761a8df0b6b1 100644 --- a/paddle/fluid/imperative/basic_engine.h +++ b/paddle/fluid/imperative/basic_engine.h @@ -30,7 +30,9 @@ class OpBase; class BasicEngine : public Engine { public: - void Init(VarBase* var, bool retain_graph = false); + void Init(const std::vector>& tensors, + const std::vector>& grad_tensors, + bool retain_graph = false); void Execute() override; @@ -46,7 +48,7 @@ class BasicEngine : public Engine { void Clear(); private: - std::shared_ptr init_node_; + std::vector> init_nodes_; std::unordered_map node_deps_; // The input and output of Inplace op are the same. If only `var` is used // as the key, then the input and output of inplace op must be gradient diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc index 873068a0d310dc..16f9454e9376e4 100644 --- a/paddle/fluid/imperative/bkcl_context.cc +++ b/paddle/fluid/imperative/bkcl_context.cc @@ -19,12 +19,11 @@ #include #include +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/bkcl_helper.h" #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/gen_comm_id_helper.h" - -#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" #include "paddle/fluid/string/string_helper.h" @@ -77,7 +76,7 @@ void BKCLParallelContext::Init() { bkcl_ids.resize(strategy_.nrings_); if (strategy_.local_rank_ == 0) { - // generate the unique ncclid on the root worker + // generate the unique bkclid on the root worker for (size_t i = 0; i < bkcl_ids.size(); ++i) { auto ret = bkcl_get_unique_id(&bkcl_ids[i]); PADDLE_ENFORCE_EQ(BKCL_SUCCESS, ret, @@ -99,6 +98,28 @@ void BKCLParallelContext::Init() { } } +void BKCLParallelContext::InitWithRingID(int ring_id) { + std::vector bkcl_ids; + bkcl_ids.resize(1); + + if (strategy_.local_rank_ == 0) { + // generate the unique bkclid on the root worker + auto ret = bkcl_get_unique_id(&bkcl_ids[0]); + PADDLE_ENFORCE_EQ(BKCL_SUCCESS, ret, + platform::errors::PreconditionNotMet( + "BKCL get unique id failed [%d]", ret)); + } + BcastBKCLId(bkcl_ids, 0); + + int xpu_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; + VLOG(0) << "init BKCL context nranks: " << strategy_.nranks_ + << " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id + << " ring id: " << ring_id; + // it will assign bkcl_comm in XPUDeviceContext within ring_id + platform::BKCLCommContext::Instance().CreateBKCLComm( + &bkcl_ids[0], strategy_.nranks_, strategy_.local_rank_, xpu_id, ring_id); +} + void BKCLParallelContext::AllReduceByStream(const framework::Variable &src, framework::Variable *dst, int ring_id, bool use_calc_stream) { @@ -146,8 +167,6 @@ void BKCLParallelContext::WaitCompute(int ring_id) { platform::errors::OutOfRange("Ring id expected < nrings," "but got ring id = %d, nrings = %d", ring_id, strategy_.nrings_)); - 
// TODO(wangxi16): [Performance optimize] Maybe need to put Wait and - // bkcl_allreduce to comm thread, for bkcl_allreduce is blocking now. auto compute_dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place_)); compute_dev_ctx->Wait(); @@ -167,6 +186,12 @@ void BKCLParallelContext::WaitComm(int ring_id) { comm_dev_ctx->Wait(); } +void BKCLParallelContext::SynchronizeCompute() { + auto compute_dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + compute_dev_ctx->Wait(); +} + } // namespace imperative } // namespace paddle #endif diff --git a/paddle/fluid/imperative/bkcl_context.h b/paddle/fluid/imperative/bkcl_context.h index d7d917f20082ac..652b7689666c6c 100644 --- a/paddle/fluid/imperative/bkcl_context.h +++ b/paddle/fluid/imperative/bkcl_context.h @@ -36,6 +36,8 @@ class BKCLParallelContext : public ParallelContext { void Init() override; + void InitWithRingID(int ring_id) override; + void AllReduceByStream(const framework::Variable& src, framework::Variable* dst, int ring_id, bool use_calc_stream) override; @@ -45,6 +47,8 @@ class BKCLParallelContext : public ParallelContext { void WaitCompute(int ring_id) override; void WaitComm(int ring_id) override; + + void SynchronizeCompute() override; }; } // namespace imperative diff --git a/paddle/fluid/imperative/dygraph_grad_maker.h b/paddle/fluid/imperative/dygraph_grad_maker.h index a3678404728275..7fefc9ccc67b52 100644 --- a/paddle/fluid/imperative/dygraph_grad_maker.h +++ b/paddle/fluid/imperative/dygraph_grad_maker.h @@ -279,6 +279,8 @@ class TracedGradOp { void SetType(const std::string& type) { op_->SetType(type); } + const framework::OperatorBase& InnerOp() const { return op_->InnerOp(); } + void SetAttrMap(const framework::AttributeMap& attrs) { return op_->SetAttrMap(attrs); } diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index b9df88b1f1eeaa..43546cf99c69ff 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -115,6 +115,23 @@ class TensorAddFunctor : public boost::static_visitor<> { } #endif +#ifdef PADDLE_WITH_ASCEND_CL + void operator()(const platform::NPUPlace& place) { + // TODO(zhiqiu): SUPPORT it + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } +#else + void operator()(const platform::NPUPlace& place) { + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } +#endif + // there is NO blas in CUDAPinnedPlace void operator()(const platform::CUDAPinnedPlace& place) { PADDLE_THROW(platform::errors::PermissionDenied( @@ -384,8 +401,8 @@ static platform::Place GetPlaceOfVar( void GradientAccumulator::AccumulateGrad() { /** - * If the gradient has been calculated by previous graph, - * it should be added to the previous graph result. + * If the leaf gradient has been calculated done, the inner_var_ + * should be added to the var_. 
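 *
 * A rough sketch (illustrative variable names) of the order in which a leaf
 * gradient is processed in one backward pass, as driven by
 * BasicEngine::Execute:
 *
 *   accumulator->SumGrad(partial_grad, trace_id);  // sum into inner_var_
 *   if (accumulator->SumGradCompleted()) {
 *     accumulator->CallGradientHooks();  // user hooks run on inner_var_
 *     accumulator->AccumulateGrad();     // merge inner_var_ into var_
 *     accumulator->CallReduceHooks();    // e.g. DataParallel reduce hook
 *   }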
*/ if (!var_->IsLeafGrad() || !SumGradCompleted() || !HasInnerVar()) { return; @@ -396,7 +413,7 @@ void GradientAccumulator::AccumulateGrad() { "this auto-grad")); PADDLE_ENFORCE_EQ(inner_var_->Var().IsInitialized(), true, platform::errors::InvalidArgument( - "Interior var of Leaf tensor should be initialized.")); + "Interior var of Leaf tensor should be initialized.")); auto* src = inner_var_->MutableVar(); auto* dst = var_->MutableVar(); if (!var_->IsEmpty()) { @@ -427,10 +444,65 @@ void GradientAccumulator::AccumulateGrad() { *(dst) = std::move(*src); var_->SetType(inner_var_->Type()); var_->SetDataType(inner_var_->DataType()); + var_->SetIsEmpty(false); } inner_var_.reset(); } +void GradientAccumulator::CallGradientHooks() { + PADDLE_ENFORCE_EQ(var_->IsLeafGrad(), true, + platform::errors::Unavailable( + "Only leaf gradient Tensor can deal with by gradient " + "hook in gradient accumulator.")); + PADDLE_ENFORCE_EQ( + SumGradCompleted(), true, + platform::errors::PreconditionNotMet( + "Only can call gradient hooks after sum gradient completed.")); + PADDLE_ENFORCE_EQ( + HasInnerVar(), true, + platform::errors::PreconditionNotMet( + "Leaf Tensor's inner var is nullptr when call gradient hook.")); + PADDLE_ENFORCE_EQ( + inner_var_->Var().IsInitialized(), true, + platform::errors::PreconditionNotMet("Leaf Tensor's inner var " + "is not initialized when " + "call gradient hook.")); + if (var_->HasVariableWrapperHook()) { + VLOG(3) << "Call " << var_->GetVariableWrapperHooks().size() + << " hooks of leaf gradient accumulator's inner var `" + << var_->Name() << "`."; + auto tmp_var = inner_var_; + VLOG(3) << "Input var " << var_->Name() << "'s hook size - " + << var_->GetVariableWrapperHooks().size(); + for (const auto& hook_pair : var_->GetVariableWrapperHooks()) { + tmp_var = (*hook_pair.second)(tmp_var); + } + inner_var_ = tmp_var; + } +} + +void GradientAccumulator::CallReduceHooks() { + PADDLE_ENFORCE_EQ( + var_->IsLeafGrad(), true, + platform::errors::Unavailable("Only leaf gradient Tensor can deal with " + "by reduce hook in gradient accumulator.")); + PADDLE_ENFORCE_EQ(SumGradCompleted(), true, + platform::errors::PreconditionNotMet( + "Only can call reduce hooks after the gradient " + "summation is completed in current batch.")); + PADDLE_ENFORCE_EQ(HasInnerVar(), false, + platform::errors::PreconditionNotMet( + "Only can call reduce hooks after the " + "gradient accumulation is completed in " + "current batch or across batchs.")); + if (var_->HasVoidHook()) { + for (const auto& hook : var_->GetVoidHooks()) { + VLOG(3) << "call gradient accumulator backward hooks."; + (*hook)(); + } + } +} + void EagerGradientAccumulator::SumGrad(std::shared_ptr var, size_t trace_id, bool unchange_input) { /** diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index e2dabc06a7dae6..6411dce4405c11 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -40,8 +40,8 @@ class GradientAccumulator { } // inner_var_ record the grad of this auto-grad. - // Only need to generate inner var for non-empty leaf-tensor. - if (var->IsLeafGrad() && !var->IsEmpty()) { + // Only need to generate inner var for leaf-tensor. 
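    // Interior (non-leaf) gradients do not need inner_var_: their hooks are
    // applied to the grad op inputs by the engine (see CallGradientHooks in
    // basic_engine.cc) and their value is consumed by the next grad op
    // directly. A leaf gradient is first summed into inner_var_ so that
    // same-batch accumulation and VariableWrapperHooks can run before the
    // result is merged into var_.
    //
    // A minimal usage sketch, assuming a leaf VarBase `x` and a hook functor
    // `DoubleHook` (both illustrative; see imperative/tests/test_hooks.cc):
    //
    //   x->GradVarBase()->AddVariableWrapperHook(
    //       std::make_shared<CppVariableWrapperHook>(DoubleHook));
    //   x->GradVarBase()->AddVoidHook(
    //       std::make_shared<std::function<void()>>([] { /* reduce-style work */ }));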
+ if (var->IsLeafGrad()) { inner_var_ = std::make_shared(var->Name()); inner_var_->SetType(var->Type()); inner_var_->SetDataType(var->DataType()); @@ -52,9 +52,6 @@ class GradientAccumulator { << ") to store result of this Graph"; } - // TODO(zhouwei): fix Tensor.clear_gradient() bug, remove this hard flag - var->SetIsEmpty(false); - // var_ is the final grad, processed by hooks and grad accumulation var_ = var; } @@ -93,42 +90,38 @@ class GradientAccumulator { inline bool HasInnerVar() const { return inner_var_ != nullptr; } - /* Hook related methods */ - inline bool HasPostHooks() const { return !post_hooks_.expired(); } - - void SetPostHooks(const std::shared_ptr& hooks) { - PADDLE_ENFORCE_NOT_NULL( - hooks, platform::errors::InvalidArgument( - "The hook set to GradientAccumulator is nullptr.")); - - auto shared_hooks = post_hooks_.lock(); - if (shared_hooks != hooks) { - PADDLE_ENFORCE_EQ( - shared_hooks, nullptr, - platform::errors::PermissionDenied( - "Cannot set post hooks twice to GradientAccumulator.")); - post_hooks_ = hooks; - } - } - // void CallHooks(){} - // ** inner_var_ ** - // function that Sum Gradient with Previous Graph void AccumulateGrad(); - // call backward post hooks, such as reduce hook - void CallBackwardPostHooks() { - PADDLE_ENFORCE_NE( - post_hooks_.expired(), true, - platform::errors::NotFound( - "The post hooks of GradientAccumulator for Tensor `%s` expired.", - var_->Name())); - auto shared_hooks = post_hooks_.lock(); - for (const auto& hook : shared_hooks->backward_hooks()) { - VLOG(3) << "call gradient accumulator backward hooks."; - (*hook)(var_); - } - } + /** [ Hook related methods ] + * + * [Why need two types of VariableWrapperHook? ] + * + * There are two types of gradient accumulation: + * 1. Gradient accumulation in same batch + * 2. Gradient accumulation across batchs + * The order of execution between Hooks and gradient accumulation: + + * [ Gradient accumulation in same batch] + * | + * [ leaf GradVarBase hooks ] + * | + * [ Gradient accumulation across batchs ] + * | + * [ Gradient reduce / allreduce hooks ] + + * Because we currently intend to accumulate these two gradient + * accumulation in one GradientAccumulator, We must distinguish between + * two types of hooks. + + * And the InplaceVariableWrapperHook does not allow users to register + * directly, and is currently only used to support the reduce strategy of + * parallel multi-card training. + */ + + void CallGradientHooks(); + + void CallReduceHooks(); protected: VariableWrapper* var_; @@ -137,7 +130,6 @@ class GradientAccumulator { std::shared_ptr inner_var_; size_t ref_cnt_{0}; size_t cur_cnt_{0}; - std::weak_ptr post_hooks_; }; class EagerGradientAccumulator : public GradientAccumulator { diff --git a/paddle/fluid/imperative/hooks.h b/paddle/fluid/imperative/hooks.h index 1211ec6ae6c7bd..fa929b7c7a51c7 100644 --- a/paddle/fluid/imperative/hooks.h +++ b/paddle/fluid/imperative/hooks.h @@ -18,215 +18,63 @@ #include #include #include - -#include "paddle/fluid/imperative/type_defs.h" -#include "paddle/fluid/platform/macros.h" - namespace paddle { namespace imperative { class VariableWrapper; -/** [ Basic hook classes ] - * s - * @brief OpBasePreHook is executed before the grad OpBase is executed, - * taking the input of the current grad OpBase as input, and - * executing python hooks (user-defined) or C++ hooks (developer-defined) - * to achieve the purpose of custom operations on the interior VarBase - * gradient. 
+/** [ VariableWrapper Hook ] * - * @note OpBasePreHook will not change the input gradient VarBase. + * @brief This hook functor is executed before the grad OpBase is executed or + * after gradient accumulation completed in current batch. + * 1. For interior var, VariableWrapper Hook take the input of the + * current grad OpBase as input. + * 2. For leaf var, VariableWrapper Hook take the inner_var_ of + * GradientAccumulator as input. * - * @note [Why need to be OpBase `PreHook`, why not `PostHook`?] + * @note This hook functor will not change the input gradient VariableWrapper, + * but if you copy the input VariableWrapper and change the value of + * Variable in VariableWrapper, the value of input will also be changed, + * because they shared same PlaceHolder. * - * If set OpBase post hook, when the op executed end, the op's output - * gradient may not be the final state, because it may need other op's - * gradient output to accumulated to it. But before op can be executed, - * the gradient output must have been accumulated to final value. + * @note [ Why need to be OpBase `PreHook`, why not `PostHook`? ] * - * @note [Why only can be used for interior VarBase?] + * We expect If set OpBase post hook, when the op executed end, the + * op's output gradient may not be the final state, because it may need + * other op's gradient output to accumulated to it. But before op can + * be executed, the gradient output must have been accumulated to final + * value. + * + * @note [ Why Leaf gradient is special? ] * * Because the leaf VarBase's GradVarBase has no GradOpNode, so leaf * GradVarBase has no next OpBase to executed, so if need to deal with - * the leaf GradVarBase, cannot use OpBasePreHook. For this case, we - * deal with by GradAccumulatorPostHook. + * the leaf GradVarBase, we should call hooks after gradient accumulation + * completed. */ -class OpBasePreHook { +class VariableWrapperHook { public: - virtual ~OpBasePreHook() = default; - virtual VariableWrapperList operator()( - const VariableWrapperList& grad_inputs) = 0; + virtual ~VariableWrapperHook() = default; + virtual std::shared_ptr operator()( + const std::shared_ptr& var) = 0; }; -/** - * @brief GradAccumulatorPostHook is the Hook that operates on the current - * gradientafter the GradientAccumulator has accumulated the gradient. - * Leaf GradVarBase has no next OpBase, if we want to register hook - * for it, we also need to wait until the leaf GradVarBase accumulation - * is completed, so we can add post hook to GradientAccumulator. - * - * @note GradAccumulatorPostHook will change the grad VarBase value. - * - * @note Only allow leaf VarBase hold GradientAccumulatorPostHook. - */ -class GradAccumulatorPostHook { - public: - virtual ~GradAccumulatorPostHook() = default; - virtual void operator()(VariableWrapper* var) = 0; -}; - -/** [ Hook for cpp functions ] - * - * Here we design three C++ hooks; - * 1. CppOpBasePreHook (Implement later): - * - used for developer-defined C++ interior VarBase hooks - * 2. CppGradAccumulatorPostHook (Implement later): - * - used for developer-defined C++ leaf VarBase hooks - * 3. LambdaGradAccumulatorPostHook: - * - used for VarBase reduce in parallel training - * - * @note [Why need two types of GradAccumulatorPostHook? ] - * - * There are two types of gradient accumulation: - * 1. Gradient accumulation in same batch - * 2. 
Gradient accumulation across batchs - * The order of execution between Hooks and gradient accumulation: - * - * [ Gradient accumulation in same batch] - * | - * [ leaf GradVarBase hooks ] - * | - * [ Gradient accumulation across batchs ] - * | - * [ Gradient reduce / allreduce] - * - * Because we currently intend to accumulate these two gradient - * accumulation in one GradientAccumulator, We must distinguish between - * two types of hooks. - * - * And the LambdaGradAccumulatorPostHook does not allow users to register - * directly, and is currently only used to support the reduce strategy of - * parallel multi-card training. - */ -class LambdaGradAccumulatorPostHook : public GradAccumulatorPostHook { +class CppVariableWrapperHook : public VariableWrapperHook { public: - explicit LambdaGradAccumulatorPostHook( - std::function fn) + explicit CppVariableWrapperHook( + std::function( + const std::shared_ptr&)>&& fn) : fn_(std::move(fn)) {} - void operator()(VariableWrapper* var) override { fn_(var); } - - private: - std::function fn_; -}; - -/* Hooks for python function: in pybind/imperative.cc */ - -/** Add Python Hooks later: - * - PyOpBasePreHook (Implement later): used for user-defined interior python - * VarBase hooks - * - PyGradAccumulatorPostHook (Implement later): used for user-defined leaf - * python VarBase hooks - */ - -/** [ Hook Pipeline classes ] - * - * @note [Why need hook pipeline classes?] - * - * There are 2 purposes for adding Hook pipeline here: - * - * 1. Make the code implementation cleaner. - * - * If there are no Hook pipeline, we need to add 3 hook vector into - * VariableWrapper, 1 hook vector into OpBase, 2 hook vector into - * GradientAccumulator, like: - * - * - VariableWrapper: - * std::vector> - * interior_var_hooks_; - * std::vector> - * leaf_var_hooks_; - * std::vector> - * backward_hooks_; - * - * - OpBase: - * std::vector> - * interior_var_hooks_; - * - * - GradientAccumulator: - * std::vector> - * leaf_var_hooks_; - * std::vector> - * backward_hooks_; - * - * This seems more complicated, and std::vector> - * is not easy to destruct. - * - * 2. Make the code easier to understand. - * - * From these two packages, we can clearly understand that we - * have two types of Hooks, respectively for the interior - * gradient var and leaf gradient var inside the backward - * calculation graph. - */ - -class InteriorVarHookPipeline { - public: - InteriorVarHookPipeline() = default; - - void add_hook(std::unique_ptr&& hook) { - hooks_.emplace_back(std::move(hook)); + std::shared_ptr operator()( + const std::shared_ptr& var) override { + return fn_(var); } - const std::vector>& hooks() const { - return hooks_; - } - - std::vector>& hooks() { return hooks_; } - private: - std::vector> hooks_; - - DISABLE_COPY_AND_ASSIGN(InteriorVarHookPipeline); -}; - -class LeafVarHookPipeline { - public: - LeafVarHookPipeline() = default; - - void add_hook(std::unique_ptr&& hook) { - hooks_.emplace_back(std::move(hook)); - } - - const std::vector>& hooks() const { - return hooks_; - } - - std::vector>& hooks() { - return hooks_; - } - - void add_backward_hook(std::unique_ptr&& hook) { - backward_hooks_.emplace_back(std::move(hook)); - } - - const std::vector>& backward_hooks() - const { - return backward_hooks_; - } - - std::vector>& backward_hooks() { - return backward_hooks_; - } - - private: - std::vector> hooks_; - // NOTE: the `backward` here means the `whole backward process`, - // the `backward_hooks_` need to be executed after the `whole backward - // process`. 
- std::vector> backward_hooks_; - - DISABLE_COPY_AND_ASSIGN(LeafVarHookPipeline); + std::function( + const std::shared_ptr&)> + fn_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 062f04c6b7052f..70359dc3fd25bf 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -406,7 +406,7 @@ void OpBase::Run(const framework::OperatorBase& op, OpBaseRunImpl(op, ins, outs, attrs, place); } -static void ClearNoNeedBufferInputs(OpBase* op) { +void ClearNoNeedBufferInputs(OpBase* op) { auto& inferer = op->Info().NoNeedBufferVarsInferer(); if (!inferer) return; auto* ins = op->GetMutableInsMap(); diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index ff5a780a5f9dbf..bbede47e364298 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -30,6 +30,7 @@ #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/flags.h" +#include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/saved_variable_wrapper_list.h" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/variable_wrapper.h" @@ -107,6 +108,10 @@ class VarBase { void ClearGradVarBase() { grad_var_ = nullptr; } + void SetGradVarBase(VarBase& grad_var) { + MutableGradVarBase()->CopyFrom(grad_var, true); + } + const std::shared_ptr& MutableGradVarBase() { if (grad_var_ == nullptr) { if (auto grad_var_wrapper = var_->GetGradVar()) { @@ -220,6 +225,28 @@ class VarBase { void BumpInplaceVersion(); + /* Hook related method: now only used for GradVarBase */ + bool HasVariableWrapperHook() const { return var_->HasVariableWrapperHook(); } + + int64_t AddVariableWrapperHook(std::shared_ptr&& hook) { + return var_->AddVariableWrapperHook( + std::forward>(hook)); + } + + bool RemoveVariableWrapperHook(const int64_t& hook_id) { + return var_->RemoveVariableWrapperHook(hook_id); + } + + const std::map>& + GetVariableWrapperHooks() const { + return var_->GetVariableWrapperHooks(); + } + + void AddVoidHook(std::shared_ptr>&& hook) { + var_->AddVoidHook( + std::forward>>(hook)); + } + private: /** * NOTE(zengjinle): never remove the const qualifier of `var_` if you are @@ -259,5 +286,7 @@ std::shared_ptr CreateGradOpNode( const platform::Place& place, const std::map& inplace_map); +void ClearNoNeedBufferInputs(OpBase* op); + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index eb0135d15e0743..b91fc460781c79 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -79,6 +79,30 @@ void NCCLParallelContext::Init() { } } +void NCCLParallelContext::InitWithRingID(int ring_id) { + std::vector nccl_ids; + nccl_ids.resize(1); + + if (strategy_.local_rank_ == 0) { + // generate the unique ncclid on the root worker + platform::dynload::ncclGetUniqueId(&nccl_ids[0]); + } + BcastNCCLId(nccl_ids, 0); + + int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device; + VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ + << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id + << " ring id: " << ring_id; + // it will assign nccl_comm in CUDADeviceContext within ring_id + platform::NCCLCommContext::Instance().CreateNCCLComm( + &nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id); + + 
compute_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( + BOOST_GET_CONST(platform::CUDAPlace, place_).device)); + comm_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( + BOOST_GET_CONST(platform::CUDAPlace, place_).device)); +} + void NCCLParallelContext::AllReduceByStream(const framework::Variable &src, framework::Variable *dst, int ring_id, bool use_calc_stream) { @@ -149,6 +173,12 @@ void NCCLParallelContext::WaitComm(int ring_id) { #endif } +void NCCLParallelContext::SynchronizeCompute() { + auto *compute_dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + compute_dev_ctx->Wait(); +} + #endif } // namespace imperative diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h index 51e5743aebdc3d..bcaeb811b108c5 100644 --- a/paddle/fluid/imperative/nccl_context.h +++ b/paddle/fluid/imperative/nccl_context.h @@ -53,6 +53,8 @@ class NCCLParallelContext : public ParallelContext { void Init() override; + void InitWithRingID(int ring_id) override; + void AllReduceByStream(const framework::Variable& src, framework::Variable* dst, int ring_id, bool use_calc_stream) override; @@ -63,6 +65,8 @@ class NCCLParallelContext : public ParallelContext { void WaitComm(int ring_id) override; + void SynchronizeCompute() override; + private: // used for comm wait compute, compute_stream-->event-->comm_stream[ring_id] std::vector> compute_events_; diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index 2b7642ae7cfd92..0164ff9313cdfe 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -177,8 +177,6 @@ class OpBase { std::unique_ptr op_; platform::Place place_; size_t id_{-1UL}; - - std::weak_ptr pre_hooks_; }; class GradOpNode { diff --git a/paddle/fluid/imperative/parallel_context.h b/paddle/fluid/imperative/parallel_context.h index ef0a9604092151..f537a316014d60 100644 --- a/paddle/fluid/imperative/parallel_context.h +++ b/paddle/fluid/imperative/parallel_context.h @@ -50,6 +50,8 @@ class ParallelContext { virtual void Init() = 0; + virtual void InitWithRingID(int ring_id) = 0; + virtual void AllReduceByStream(const framework::Variable& src, framework::Variable* dst, int ring_id, bool use_calc_stream) = 0; @@ -64,6 +66,9 @@ class ParallelContext { // if CPU, should do nothing. virtual void WaitComm(int ring_id) = 0; + // synchorize compute stream + virtual void SynchronizeCompute() = 0; + inline int GetNRings() const { return strategy_.nrings_; } inline int64_t GetNRanks() const { return strategy_.nranks_; } diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index 8dd8cafc835ab1..3da3a05ed1071c 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -369,6 +369,10 @@ class GradientAccumulationInfo { *is_finished = (cur_ref_cnt_ == total_ref_cnt_); accumulator_->SumGrad(grad_var_partial, trace_id, unchange_input); + if (*is_finished && accumulator_->HasInnerVar()) { + accumulator_->AccumulateGrad(); + } + if (create_graph_) { VLOG(10) << "Store partial grad grad for double grad " << mapped_grad_var_->Name(); diff --git a/paddle/fluid/imperative/py_layer_fwd.h b/paddle/fluid/imperative/py_layer_fwd.h new file mode 100644 index 00000000000000..bd132f2576fec1 --- /dev/null +++ b/paddle/fluid/imperative/py_layer_fwd.h @@ -0,0 +1,172 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/tracer.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/operators/py_layer_op.h" + +namespace paddle { +namespace imperative { + +namespace py = ::pybind11; + +bool RequiredGrad(const NameVarBaseMap& ins, const NameVarBaseMap& outs) { + for (const auto& name_pair : ins) { + for (const auto& var_base : name_pair.second) { + if (!var_base->OverridedStopGradient()) { + PassStopGradient(outs, var_base->OverridedStopGradient()); + return true; + } + } + } + return false; +} + +std::shared_ptr CreateGradOpNode( + const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, const framework::AttributeMap& attrs, + const platform::Place& place, + const std::map& inplace_map, + const std::shared_ptr& py_context) { + operators::PyLayerGradOpMaker maker( + type, ins, outs, attrs, inplace_map); + + maker.SetPyLayerContext(py_context); + auto grad_node = maker(); + if (grad_node && !grad_node->empty()) { + for (auto& grad_op : *grad_node) { + grad_op.SetId(OpBase::GenerateUniqueId()); + grad_op.SetPlace(place); + ClearNoNeedBufferInputs(&grad_op); + } + return grad_node; + } else { + return nullptr; + } +} + +py::object PyLayerApply(const platform::Place& place, const py::object& cls, + const py::args args, const py::kwargs kwargs) { + auto bk_function = cls.attr("_backward_function"); + auto context = bk_function(); + auto forward = cls.attr("forward"); + + auto result_forward = forward(context, *args, **kwargs); + std::shared_ptr py_layer_ctx = + std::make_shared(context.release().ptr()); + // make inputs to varbase + std::vector> input_vars; + // process args,`input_vars` only collect `imperative::VarBase` + if (!args.empty()) { + for (auto ptr = args.begin(); ptr != args.end(); ptr++) { + try { + if (Py_None != ptr->ptr()) { + auto a = ptr->cast>(); + input_vars.push_back(a); + } + } catch (py::cast_error& err) { + // Only collect Tensor type in 'args' and pass them to backward. Ignore + // other types of input temporarily. + } + } + } + // process kwargs, only collect `imperative::VarBase` + if (!kwargs.empty()) { + for (auto ptr = kwargs.begin(); ptr != kwargs.end(); ptr++) { + try { + if (Py_None != ptr->second.ptr()) { + auto a = ptr->second.cast>(); + input_vars.push_back(a); + } + } catch (py::cast_error&) { + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. 
+ } + } + } + NameVarBaseMap ins = {{"X", input_vars}}; + + std::vector> output_vars; + if (PyTuple_Check(result_forward.ptr()) || + PyList_Check(result_forward.ptr())) { + auto tuple_result = result_forward.cast(); + for (size_t i = 0; i < tuple_result.size(); i++) { + if (Py_None != tuple_result[i].ptr()) { + try { + auto temp_out = + tuple_result[i].cast>(); + output_vars.push_back(temp_out); + } catch (py::cast_error&) { + PADDLE_THROW(platform::errors::Unimplemented( + "The output of `PyLayer.forward` should be `Tensor`.")); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "The output of `PyLayer.forward` can not be `None`.")); + } + } + } else { + if (Py_None != result_forward.ptr()) { + try { + auto temp_out = + result_forward.cast>(); + output_vars.push_back(temp_out); + } catch (py::cast_error&) { + PADDLE_THROW(platform::errors::Unimplemented( + "The output of `PyLayer.forward` should be `Tensor`.")); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "The output of `PyLayer.forward` can not be `None`.")); + } + } + + NameVarBaseMap outs = {{"Out", output_vars}}; + + if (RequiredGrad(ins, outs)) { + std::map inplace_map{}; + bool if_inplace = false; + for (auto temp_ins : input_vars) { + if (if_inplace) { + break; + } + for (auto temp_outs : output_vars) { + if (temp_ins->Name() == temp_outs->Name()) { + if_inplace = true; + break; + } + } + } + if (if_inplace) { + inplace_map["X"] = "Out"; + } + + CreateGradOpNode("py_layer", ins, outs, {{}}, place, inplace_map, + py_layer_ctx); + } else { + VLOG(3) << "No Grad to track for Op: py_layer_op"; + } + + return result_forward; +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index e8b531d35cabfc..a92704ce447dc1 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -310,13 +310,16 @@ Reducer::Reducer(const std::vector> &vars, for (size_t global_var_index = 0; global_var_index < vars_.size(); ++global_var_index) { auto var = vars_[global_var_index]; - var->SharedVar()->AddGradVarLeafBackwardHook( - std::unique_ptr( - new LambdaGradAccumulatorPostHook([=](VariableWrapper *grad) { - this->AddDistHook(global_var_index); - }))); + var->GradVarBase()->AddVoidHook(std::make_shared>( + [=]() { this->AddDistHook(global_var_index); })); var_index_map_[var->GradVarBase()->SharedVar().get()] = global_var_index; } + + // for checking var is ready once + vars_marked_ready_.resize(vars_.size(), false); + + // Initialize local used vars + local_used_vars_.resize(vars_.size(), 0); } void Reducer::InitializeDenseGroups( @@ -325,7 +328,7 @@ void Reducer::InitializeDenseGroups( for (size_t index = 0; index < variable_indices_.size(); ++index) { const auto variable_index = variable_indices_[index]; const auto &var = vars_[variable_index]; - const auto var_name = var->Name(); + const auto &var_name = var->Name(); PADDLE_ENFORCE_EQ(is_sparse_gradient_[variable_index], false, platform::errors::PreconditionNotMet( "Tensor %s's GRAD must be LoDTensor, but received " @@ -336,7 +339,7 @@ void Reducer::InitializeDenseGroups( PADDLE_ENFORCE_EQ(lod_tensor->IsInitialized(), true, platform::errors::PreconditionNotMet( "Tensor %s is not initialized.", var_name)); - auto size = lod_tensor->numel(); + const auto size = lod_tensor->numel(); PADDLE_ENFORCE_GT( size, 0, platform::errors::PreconditionNotMet( "The number of tensor %s's elements is 0.", var_name)); @@ -348,8 +351,8 @@ void 
Reducer::InitializeDenseGroups( p_group->dense_tensors_.push_back(framework::Tensor()); // check the dtype and place, it must be same. - auto dtype = var->DataType(); - auto place = var->Place(); + const auto &dtype = var->DataType(); + const auto &place = var->Place(); if (index > 0) { PADDLE_ENFORCE_EQ( dtype, p_group->dtype_, @@ -419,8 +422,7 @@ void Reducer::InitializeGroups( group.variable_indices_ = std::move(variable_indices_); groups_.emplace_back(std::move(group)); // Debug Message For Reducer - VLOG(3) << "The Group[" << group_index << "]:"; - VLOG(3) << groups_.back(); + VLOG(3) << "The Group[" << group_index << "]:" << groups_.back(); } } @@ -463,34 +465,38 @@ void Reducer::PrepareDeps(const std::unordered_set &init_nodes) { // and allreudce sequence counter(next_group_) will be cleaned up again. void Reducer::PrepareForBackward( const std::vector> &outputs) { - VLOG(3) << "start reseting count.."; + VLOG(3) << "after forward, then reset count for backward."; next_group_ = 0; std::for_each(groups_.begin(), groups_.end(), [](Group &group) { group.pending_ = group.variable_indices_.size(); group.sparse_contents_ = nullptr; }); + // reinitialize vars_marked_ready_ for next iteration + vars_marked_ready_.clear(); + vars_marked_ready_.resize(vars_.size(), false); + PADDLE_ENFORCE_EQ( - all_group_ready_, false, + groups_need_finalize_, false, platform::errors::PreconditionNotMet( - "Please note that all forward outputs derived from the module " + "A serious error has occurred here. There may be several reasons: " + "1) Please note that all forward outputs derived from the module " "parameters must participate in the calculation of losses and " "subsequent gradient calculations. If not, the wrapper will hang, " "waiting for autograd to generate gradients for these parameters. " "you can use detach or stop_gradient to make the unused parameters " - "detached from the autograd graph.")); + "detached from the autograd graph. " + "2) Used multiple forwards and one backward. You may be able to wrap " + "multiple forwards in a model.")); // The first var to trigger the unused parameter has_marked_unused_vars_ = false; + unused_vars_.clear(); + if (!find_unused_vars_) { return; } - // TODO(shenliang03) "find_unused_vars" interface will be exposed in the - // future to handle control flow to process unused parameters - find_unused_vars_ = false; - - unused_vars_.clear(); node_deps_.clear(); std::queue> q; std::unordered_set var_visited; @@ -553,6 +559,23 @@ void Reducer::PrepareForBackward( << "] is not used"; } } + + if (unused_vars_.empty()) { + LOG_FIRST_N(WARNING, 1) + << "All parameters are involved in the backward pass. " + "It is recommended to set find_unused_parameters to False " + "to improve performance. However, if unused parameters " + "appear in subsequent iterative training, then an error " + "will occur. Please make it clear that in the subsequent " + "training, there will be no parameters that are not used " + "in the backward pass, and then set find_unused_parameters"; + } else if (unused_vars_.size() == vars_.size()) { + LOG_FIRST_N(WARNING, 1) + << "There is no parameter in the device involved " + "in the backward calculation. If there are " + "parameters on other devices involved in the " + "backward, then a serious error will occur here."; + } } // Add hook function to each leaf node. When the gradient of a leaf node is @@ -565,67 +588,133 @@ void Reducer::PrepareForBackward( // concat + allreduce + split is emitted in turn according to next_group_. 
// 3, FinalizeBackward: after the end, synchronize each stream. void Reducer::AddDistHook(size_t var_index) { + PADDLE_ENFORCE_LT(var_index, variable_locators_.size(), + platform::errors::OutOfRange( + "Out of bounds variable index. it must be less" + "than %d, but it is %d", + variable_locators_.size(), var_index)); + VLOG(3) << "Var[" << var_index << "] [" << vars_[var_index]->GradVarBase()->Name() << "] arrived and triggered disthook"; - if (!has_marked_unused_vars_) { - has_marked_unused_vars_ = true; - for (auto unused_index : unused_vars_) { - if (NeedRebuildGroup()) { - rebuild_vars_.push_back(vars_[unused_index]); - rebuild_var_indices_.push_back(unused_index); - } - MarkVarReady(unused_index, false); - } - } + local_used_vars_[var_index] = 1; + + // rebuild group when find_unused_vars_ is false if (NeedRebuildGroup()) { rebuild_vars_.push_back(vars_[var_index]); rebuild_var_indices_.push_back(var_index); } + + if (!has_marked_unused_vars_ && find_unused_vars_) { + has_marked_unused_vars_ = true; + for (const auto &unused_index : unused_vars_) { + MarkVarReady(unused_index, false); + } + } + MarkVarReady(var_index, true); } void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { - all_group_ready_ = true; + groups_need_finalize_ = true; + const auto &var_locator = variable_locators_[var_index]; - auto group_index = var_locator.group_index; + const auto group_index = var_locator.group_index; auto &group = groups_[group_index]; + // error happened, if the var is ready before. + if (vars_marked_ready_[var_index]) { + auto error_info = string::Sprintf( + "Error happened, when parameter[%d][%s] has been ready before. " + "There may be several reasons for this error: " + "1) In multiple reentrant backward phase, some parameters are reused." + "2) Using model parameters outside of forward function. Please " + "make sure that model parameters are not shared in concurrent " + "forward-backward passes.", + var_index, vars_[var_index]->GradVarBase()->Name()); + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, false, + platform::errors::PreconditionNotMet(error_info)); + + error_info += + "3) Unused parameters retrieval is incorrect. " + "The return value of forward will be used to retrieve" + " the unused parameters of the entire model. These " + "gradients of unused parameters will not be synchronized " + "between multiple cards. However, if the unused " + "parameters participate in the backward calculation " + "again at a later time (e.g. 
after the forward function, " + "the loss calculation uses the unused " + "paramters of the forward and trigger backward), " + "its gradient will be wrong."; + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, true, + platform::errors::PreconditionNotMet(error_info)); + } else { + vars_marked_ready_[var_index] = true; + } + if (!group.is_sparse_) { // process dense group - auto inside_group_index = var_locator.inside_group_index; - auto length = group.length_[inside_group_index]; + const auto inside_group_index = var_locator.inside_group_index; + const auto length = group.length_[inside_group_index]; auto &group_tensor = group.dense_tensors_[inside_group_index]; + if (is_used_var) { - auto var_warpper = vars_[var_index]->GradVarBase()->SharedVar(); - auto tensor = - var_warpper->MutableVar()->GetMutable(); + auto var_base = vars_[var_index]->GradVarBase(); + auto tensor = var_base->MutableVar()->GetMutable(); group_tensor.ShareDataWith(*tensor).Resize( {static_cast(length)}); } else { + // TODO(shenliang03): maybe save the memory + // by avoiding tensor construction if (!group_tensor.IsInitialized()) { group_tensor.Resize({static_cast(length)}); group_tensor.mutable_data(place_, group.dtype_); + } + #ifdef PADDLE_WITH_XPU_BKCL - if (platform::is_xpu_place(group_tensor.place())) { - // TODO(liuyuhui) support XPU set constant - VLOG(3) << "XPU doesn't support set_constant"; - } + if (platform::is_xpu_place(group_tensor.place())) { + // TODO(liuyuhui) support XPU set constant + VLOG(3) << "XPU doesn't support set_constant"; + } #else - auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); + auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); + if (HasGrad(var_index)) { + auto var_base = vars_[var_index]->GradVarBase(); + auto tensor = + var_base->MutableVar()->GetMutable(); + TensorCopy(*tensor, place_, *dev_ctx, &group_tensor); + group_tensor.Resize({static_cast(length)}); + } else { + group_tensor.Resize({static_cast(length)}); operators::math::set_constant(*dev_ctx, &group_tensor, 0.0); -#endif } +#endif } } else { // process sparse group - if (is_used_var) { - auto var_warpper = vars_[var_index]->GradVarBase()->SharedVar(); - group.sparse_contents_ = var_warpper->MutableVar(); - } else { - group.sparse_contents_ = nullptr; - } + PADDLE_ENFORCE_EQ(HasGrad(var_index), true, + platform::errors::PreconditionNotMet( + "The sparse parameter[%d][%s] must have a gradient", + var_index, vars_[var_index]->Name())); + auto var_base = vars_[var_index]->GradVarBase(); + // need to check tensor type + PADDLE_ENFORCE_EQ( + var_base->Var().IsType(), true, + platform::errors::PreconditionNotMet( + "The sparse parameter[%d][%s] must have a selectedrows gradient. " + "Before forward pass, the parameter type is inferred to be " + "SelectedRows, but after backward pass, its actual type becomes " + "LodTensor. It is currently not supported by DataParallel. " + "For example, if sparse embedding is used, and the weight of " + "embedding is shared with subsequent dense parameters, then " + "the parameter gradient of the embedding will be converted " + "to dense parameters.", + var_index, vars_[var_index]->Name())); + + group.sparse_contents_ = var_base->MutableVar(); } if (--group.pending_ == 0) { @@ -641,6 +730,14 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { // TODO(liuyuhui): If BKCL support non-blocking communication, it should be // fixed as same as multi gpus card trainging. 
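// A rough sketch of one backward pass through the Reducer (all names below
// are members and methods of this file; error handling omitted):
//
//   PrepareForBackward(outputs);   // reset next_group_, pending_, vars_marked_ready_
//   // autograd runs; every ready leaf gradient fires AddDistHook(var_index)
//   MarkVarReady(var_index, is_used_var);   // share/copy grad into group.dense_tensors_
//   if (--group.pending_ == 0) MarkGroupReady(group_index);
//   FusedAllReduceSchedule(run_order, group, curr_group_index);
//   //   concat -> DivNRanks -> AllReduceByStream -> split
//   FinalizeBackward();   // WaitComm on all rings, then ProcessUnusedDenseVars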
void Reducer::MarkGroupReady(size_t group_index) { + PADDLE_ENFORCE_GE( + group_index, next_group_, + platform::errors::PreconditionNotMet( + "The index of the incoming group must be greater " + "than or equal to the previously synchronized group index, " + "expect it to greater than or equal to %d, but got %d.", + next_group_, group_index)); + if (group_index > next_group_) { VLOG(3) << "It will adjust the order of group in next batch automatically"; return; @@ -649,7 +746,7 @@ void Reducer::MarkGroupReady(size_t group_index) { for (; next_group_ < groups_.size() && groups_[next_group_].pending_ == 0; ++next_group_) { auto &group = groups_[next_group_]; - int run_order = next_group_ % nrings_; + const int run_order = next_group_ % nrings_; // For CUDA or XPU, compute_stream --> comm_stream. // For CPU, do nothing. @@ -668,7 +765,7 @@ void Reducer::MarkGroupReady(size_t group_index) { comm_pool_->enqueue([&] { auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; platform::SetXPUDeviceId(dev_id); - FusedAllReduceSchedule(run_order, group); + FusedAllReduceSchedule(run_order, group, next_group_); { std::lock_guard lock(mutex_); comm_op_count_ -= 1; // lock @@ -676,7 +773,7 @@ void Reducer::MarkGroupReady(size_t group_index) { } }); #elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) - FusedAllReduceSchedule(run_order, group); + FusedAllReduceSchedule(run_order, group, next_group_); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "Not compiled with BKCL or NCCL.")); @@ -684,24 +781,23 @@ void Reducer::MarkGroupReady(size_t group_index) { } } -void Reducer::FusedAllReduceSchedule(int run_order, Group &group) { +void Reducer::FusedAllReduceSchedule(const int run_order, Group &group, + const int curr_group_index) { + // The overall timeline: concat > div_nranks > allreduce > split + // dev_context is used to select different stream + const auto &dev_context = *parallel_ctx_->GetDeviceContext(run_order); if (group.is_sparse_) { - if (group.sparse_contents_ != nullptr) { - VLOG(3) << "sparse group [" << next_group_ << "] start allreduce in ring[" - << run_order << "]"; - group.DivNRanks(*parallel_ctx_->GetDeviceContext(run_order), nranks_); - parallel_ctx_->AllReduceByStream( - *group.sparse_contents_, group.sparse_contents_, run_order, false); - } else { - VLOG(3) << "The sparse group[" << next_group_ - << "] has no var to allreduce"; - } + VLOG(3) << "sparse group [" << curr_group_index + << "] start allreduce in ring[" << run_order << "]"; + group.DivNRanks(dev_context, nranks_); + parallel_ctx_->AllReduceByStream(*group.sparse_contents_, + group.sparse_contents_, run_order, false); } else { - VLOG(3) << "dense group [" << next_group_ << "] start allreduce in ring[" - << run_order << "]"; + VLOG(3) << "dense group [" << curr_group_index + << "] start allreduce in ring[" << run_order << "]"; // Select common commstream to concat tensors // group.dense_tensors ---> group.dense_contents_ - group.ConcatTensors(*parallel_ctx_->GetDeviceContext(run_order)); + group.ConcatTensors(dev_context); // NOTE(liuyuhui): ConcatTensors use communication stream, but BKCL only support // default stream for communicating, so there exist some problems in @@ -713,15 +809,15 @@ void Reducer::FusedAllReduceSchedule(int run_order, Group &group) { parallel_ctx_->WaitComm(run_order); } #endif - group.DivNRanks(*parallel_ctx_->GetDeviceContext(run_order), nranks_); + group.DivNRanks(dev_context, nranks_); // Start allreduce parallel_ctx_->AllReduceByStream( group.dense_contents_, 
&(group.dense_contents_), run_order, false); - // Select common commstream to split tensors + // Select communication stream to split tensors // group.dense_contents_ ---> group.dense_tensors - group.SplitTensors(*parallel_ctx_->GetDeviceContext(run_order)); + group.SplitTensors(dev_context); } } @@ -747,14 +843,98 @@ std::vector> Reducer::RebuildGruops() { return rebuild_group_indices; } +void Reducer::ProcessUnusedDenseVars() { + // The calculation stream must be used here to + // avoid conflicts with communication. + VLOG(3) << "Local used vars : " + << string::join_strings(local_used_vars_, ','); + const auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); + // H2D is to allreduce the local_used_vars_ + auto *global_used_tensor = + global_used_vars_.GetMutable(); + framework::TensorFromVector(local_used_vars_, *dev_ctx, + global_used_tensor); + parallel_ctx_->AllReduceByStream(global_used_vars_, &global_used_vars_, 0, + true); + framework::TensorToVector(*global_used_tensor, *dev_ctx, + &local_used_vars_); + + // sync compute stream to get global used var message, + // but maybe affect speed performance + parallel_ctx_->SynchronizeCompute(); + VLOG(3) << "Global used vars : " + << string::join_strings(local_used_vars_, ','); + + for (const auto var_index : unused_vars_) { + const bool global_unused = (local_used_vars_[var_index] == 0); + + // global used but local unused, set grad + VLOG(3) << "Var [" << var_index << "] [" << vars_[var_index]->Name() + << "] global_unused:" << global_unused + << " has grad: " << HasGrad(var_index); + + if (!global_unused) { + VLOG(3) << "Start process unused Var"; + // 1. source var base + const auto &var_locator = variable_locators_[var_index]; + const auto group_index = var_locator.group_index; + const auto &group = groups_[group_index]; + const auto inside_group_index = var_locator.inside_group_index; + const auto &src_tensor = group.dense_tensors_[inside_group_index]; + // sparse no need to check and no support find_unused_parameters + if (group.is_sparse_) { + continue; + } + // 2. destination var base + auto dest_var_base = vars_[var_index]; + auto *dest_tensor = + dest_var_base->MutableVar()->GetMutable(); + const auto &dest_dims = dest_tensor->dims(); + + // 3. create grad var base or get grad var base + auto grad_var_base_tmp = dest_var_base->MutableGradVarBase(); + + // 4. 
set grad tensor + auto *dest_grad_tensor = + grad_var_base_tmp->MutableVar()->GetMutable(); + const auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); + TensorCopy(src_tensor, place_, *dev_ctx, dest_grad_tensor); + dest_grad_tensor->Resize(dest_dims); + } + } +} + +bool Reducer::HasGrad(size_t var_index) { + const auto grad_var = vars_[var_index]->GradVarBase(); + if (!grad_var || !grad_var->Var().IsInitialized()) { + return false; + } + + const auto &var = grad_var->Var(); + if (var.IsType()) { + if (var.Get().IsInitialized()) { + return true; + } + } else if (var.IsType()) { + if (var.Get().value().IsInitialized()) { + return true; + } + } else { + PADDLE_THROW(platform::errors::PermissionDenied( + "Only support LoDTensor and SelectedRows for gradient var")); + } + return false; +} + void Reducer::FinalizeBackward() { - all_group_ready_ = false; + groups_need_finalize_ = false; #ifdef PADDLE_WITH_XPU_BKCL { std::unique_lock lock(mutex_); cv_.wait(lock, [&] { return comm_op_count_ == 0; }); } #endif + // Must prevent compute_stream_ starting until all comm streams have finished for (int i = 0; i < nrings_; ++i) { parallel_ctx_->WaitComm(i); @@ -767,7 +947,18 @@ void Reducer::FinalizeBackward() { InitializeGroups(group_indices_); } - VLOG(3) << "In the batch, Reducer is finished..."; + if (find_unused_vars_) { +// TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + ProcessUnusedDenseVars(); +#endif + // Initialize local used vars + local_used_vars_.clear(); + local_used_vars_.resize(vars_.size(), 0); + VLOG(3) << "ProcessUnusedDenseVars is finished."; + } + + VLOG(3) << "In the batch, Reducer is finished."; } // According to the size of each parameter, it is allocated to different groups. diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index b2680d0dea71aa..0d613dbea89633 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -27,6 +27,7 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" @@ -153,13 +154,20 @@ class Reducer { void MarkGroupReady(size_t group_index); - void FusedAllReduceSchedule(int run_order, Group& group); // NOLINT + void FusedAllReduceSchedule(const int run_order, Group& group, // NOLINT + const int curr_group_index); void FinalizeBackward(); std::vector> RebuildGruops(); - inline bool NeedRebuildGroup() { return !has_rebuilt_group_; } + inline bool NeedRebuildGroup() { + return !has_rebuilt_group_ && !find_unused_vars_; + } + + void ProcessUnusedDenseVars(); + + bool HasGrad(size_t var_index); private: std::vector> vars_; @@ -188,7 +196,7 @@ class Reducer { std::vector unused_vars_; bool has_marked_unused_vars_{false}; bool find_unused_vars_{false}; - bool all_group_ready_{false}; + bool groups_need_finalize_{false}; #ifdef PADDLE_WITH_XPU_BKCL // comm_pool_ is used for scheduling allreduce in multi Kunlun cards training. std::unique_ptr<::ThreadPool> comm_pool_{nullptr}; @@ -196,6 +204,19 @@ class Reducer { std::mutex mutex_; std::condition_variable cv_; #endif + + // it just for checking hook, each parameter can only trigger one hook + std::vector vars_marked_ready_; + + // Following variables are to help control flow. 
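
The unused-parameter handling added here reduces to a vector allreduce followed by a per-variable decision. A minimal, self-contained sketch of that control flow, using plain STL containers in place of the real Reducer and ParallelContext types (an element-wise sum across ranks stands in for AllReduceByStream; names are illustrative only):

#include <cstdint>
#include <vector>

// Element-wise sum across ranks stands in for the real allreduce of
// local_used_vars_: after it, entry i is the number of ranks that
// produced a gradient for variable i.
std::vector<int32_t> AllReduceUsage(
    const std::vector<std::vector<int32_t>>& per_rank_usage) {
  std::vector<int32_t> global(per_rank_usage.front().size(), 0);
  for (const auto& local : per_rank_usage) {
    for (size_t i = 0; i < local.size(); ++i) global[i] += local[i];
  }
  return global;
}

// A variable is globally unused only if no rank used it. Otherwise a rank
// that did not produce the gradient locally copies the corresponding slice
// of the fused, allreduced buffer into its local gradient tensor.
bool NeedFillLocalGrad(const std::vector<int32_t>& global_usage,
                       size_t var_index, bool has_local_grad) {
  const bool global_unused = (global_usage[var_index] == 0);
  return !global_unused && !has_local_grad;
}
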
+ // local_used_vars_ uses 0/1 to indicate whether the + // var is used in iteration. After the end of the + // iteration, global_used_vars_ is obtained synchronously + // globally. Choose whether to update the local + // gradient according to the global_used_vars_. + std::vector local_used_vars_; + // global_used_vars_ is used in comm stream to avoid wait + framework::Variable global_used_vars_; }; std::vector> AssignGroupBySize( diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 7bf5f876681bab..5c4e1538cf0538 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -37,6 +37,30 @@ namespace imperative { using vb_vector = std::vector>; using var_pair = std::pair; +std::shared_ptr DoubleHook( + const std::shared_ptr& var) { + // 1. create out var + auto out_var = std::make_shared(var->Name()); + out_var->SetType(var->Type()); + out_var->SetDataType(var->DataType()); + out_var->SetForwardDataType(var->ForwardDataType()); + out_var->InnerSetOverridedStopGradient(var->InnerOverridedStopGradient()); + + // 2. get input and output var's tensor + auto* out_tensor = out_var->MutableVar()->GetMutable(); + auto& tensor = var->Var().Get(); + out_tensor->Resize(tensor.dims()); + + // 3. double calc + auto* data = tensor.data(); + auto* out_data = out_tensor->mutable_data(platform::CPUPlace()); + for (int64_t i = 0; i < out_tensor->numel(); ++i) { + out_data[i] = data[i] * 2.0; + } + + return out_var; +} + TEST(TestHooks, TestGradVarLeafBackwardHook) { // 1. prepare Tracer tracer; @@ -73,17 +97,14 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { framework::AttributeMap mul_attr_map; mul_attr_map["use_mkldnn"] = false; - // add GradAccumulatorPostHook - auto x_var_wrapper = x->SharedVar(); - x_var_wrapper->AddGradVarLeafBackwardHook( - std::unique_ptr( - new LambdaGradAccumulatorPostHook([=](VariableWrapper* grad) { - auto* grad_tensor = - grad->MutableVar()->GetMutable(); - for (int i = 0; i < grad_tensor->numel(); ++i) { - grad_tensor->mutable_data(place)[i] *= 2.0; - } - }))); + // add VariableWrapper hook + x->GradVarBase()->AddVariableWrapperHook( + std::make_shared(DoubleHook)); + + // add Void hook + int64_t hook_value = 0; + x->GradVarBase()->AddVoidHook( + std::make_shared>([&]() { hook_value = 10; })); // 2. forward tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); @@ -93,16 +114,21 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL); // 3. 
backward + std::vector> tensors{out}; + std::vector> grad_tensors{nullptr}; BasicEngine engine; - engine.Init(out.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); + // verify VariableWrapper hook result framework::LoDTensor x_grad; framework::TensorCopySync(x->GradVar().Get(), place, &x_grad); for (int i = 0; i < x_grad.numel(); ++i) { ASSERT_EQ(x_grad.data()[i], 8.0); } + // verify Void hook result + ASSERT_EQ(hook_value, 10); framework::LoDTensor y_grad; framework::TensorCopySync(y->GradVar().Get(), place, @@ -151,17 +177,14 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { memory::Copy(place, mutable_z, place, src_data.data(), sizeof(float) * src_data.size()); - // add GradAccumulatorPostHook - auto x_var_wrapper = x->SharedVar(); - x_var_wrapper->AddGradVarLeafBackwardHook( - std::unique_ptr( - new LambdaGradAccumulatorPostHook([=](VariableWrapper* grad) { - auto* grad_tensor = - grad->MutableVar()->GetMutable(); - for (int i = 0; i < grad_tensor->numel(); ++i) { - grad_tensor->mutable_data(place)[i] *= 2.0; - } - }))); + // add VariableWrapper hook + x->GradVarBase()->AddVariableWrapperHook( + std::make_shared(DoubleHook)); + + // add Void hook + int64_t hook_value = 0; + x->GradVarBase()->AddVoidHook( + std::make_shared>([&]() { hook_value = 100; })); // 2. forward var_pair x_pair = var_pair("X", vb_vector(1, x)); @@ -193,16 +216,21 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL); // 3. backward + std::vector> tensors{out}; + std::vector> grad_tensors{nullptr}; BasicEngine engine; - engine.Init(out.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); + // verify VariableWrapper hook result framework::LoDTensor x_grad; framework::TensorCopySync(x->GradVar().Get(), place, &x_grad); for (int i = 0; i < x_grad.numel(); ++i) { ASSERT_EQ(x_grad.data()[i], 16.0); } + // verify Void hook result + ASSERT_EQ(hook_value, 100); framework::LoDTensor y_grad; framework::TensorCopySync(y->GradVar().Get(), place, diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index 9e3b0ea5df6838..76de413b3e6033 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -250,7 +250,10 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) { tracer.TraceOp("reduce_sum", reduce_in, reduce_out, reduce_attr_map, gpu_place, true); imperative::BasicEngine engine; - engine.Init(reduce_sum_out.get()); + + std::vector> tensors{reduce_sum_out}; + std::vector> grad_tensors{nullptr}; + engine.Init(tensors, grad_tensors); engine.Execute(); framework::LoDTensor rlt; @@ -376,8 +379,10 @@ TEST(test_tracer, test_var_without_grad_var) { ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL); ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL); + std::vector> tensors{vout}; + std::vector> grad_tensors{nullptr}; imperative::BasicEngine engine; - engine.Init(vout.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); // check the grad diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 608cc407d5b776..777cb10e0754c3 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -38,7 +38,7 @@ void SetCurrentTracer(const std::shared_ptr& tracer) { VLOG(6) << "Set current tracer: " << g_current_tracer; } -static void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad) { +void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad) { for (const auto& 
pair : outs) { for (const auto& var : pair.second) { // NOTE(zhiqiu): this happends when None output are passed from python diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index b10d1b2d0b49da..8f50550878262f 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -130,5 +130,7 @@ void IncreaseVarbaseReferenceCountUntilCopyComplete( const std::shared_ptr& var, const platform::Place& place); +void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad); + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index b42f25dcc88001..5fa8b89a396d9b 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -27,8 +27,8 @@ namespace paddle { namespace imperative { -class InteriorVarHookPipeline; -class LeafVarHookPipeline; +class VariableWrapperHook; +class InplaceVariableWrapperHook; class VarBase; class GradOpNode; @@ -38,6 +38,9 @@ class VariableWrapper { explicit VariableWrapper(const std::string& name) : name_(name) {} + VariableWrapper(const std::string& name, const framework::Variable& variable) + : var_(variable), name_(name) {} + ~VariableWrapper() { VLOG(10) << "Destruct VariableWrapper: " << Name(); } const framework::Variable& Var() const { return var_; } @@ -193,42 +196,6 @@ class VariableWrapper { } } - /* Hook related method: only can be call by GradVarBase */ - - bool HasInteriorHooks() const { return interior_hooks_ != nullptr; } - - bool HasLeafHooks() const { return leaf_hooks_ != nullptr; } - - void AddGradVarInteriorHook(std::unique_ptr&& hook) { - auto interior_hooks = GetGradVarInteriorHooksSafely(); - interior_hooks->add_hook(std::move(hook)); - } - - void AddGradVarLeafHook(std::unique_ptr&& hook) { - auto leaf_hooks = GetGradVarLeafHooksSafely(); - leaf_hooks->add_hook(std::move(hook)); - } - - void AddGradVarLeafBackwardHook( - std::unique_ptr&& hook) { - auto leaf_hooks = GetGradVarLeafHooksSafely(); - leaf_hooks->add_backward_hook(std::move(hook)); - } - - const std::shared_ptr& GetInteriorHooks() const { - return interior_hooks_; - } - - std::shared_ptr& GetInteriorHooks() { - return interior_hooks_; - } - - const std::shared_ptr& GetLeafHooks() const { - return leaf_hooks_; - } - - std::shared_ptr& GetLeafHooks() { return leaf_hooks_; } - uint32_t InplaceVersionSnapshot() const { return inplace_version_snapshot_; } void ResetInplaceVersion() { @@ -255,6 +222,38 @@ class VariableWrapper { return; } + /* Hook related methods */ + bool HasVariableWrapperHook() const { return !var_hooks_.empty(); } + + int64_t AddVariableWrapperHook(std::shared_ptr&& hook) { + var_hooks_.emplace(next_hook_id_, std::move(hook)); + return next_hook_id_++; + } + + bool RemoveVariableWrapperHook(const int64_t& hook_id) { + auto remove_cnt = var_hooks_.erase(hook_id); + if (remove_cnt == 0) { + return false; + } + return true; + } + + const std::map>& + GetVariableWrapperHooks() const { + return var_hooks_; + } + + bool HasVoidHook() const { return !void_hooks_.empty(); } + + void AddVoidHook(std::shared_ptr>&& hook) { + void_hooks_.emplace_back(std::move(hook)); + } + + const std::vector>>& GetVoidHooks() + const { + return void_hooks_; + } + private: void SetGradVar(const std::shared_ptr& var) { auto shared_var = grad_var_.lock(); @@ -289,41 +288,6 @@ class VariableWrapper { } } - /* Hook related private methods */ - std::shared_ptr GetGradVarSafely() const { - auto 
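
The replacement hook storage introduced in variable_wrapper.h is an id-keyed map plus an append-only list of void callbacks. A small stand-alone sketch of that pattern (generic types, not the actual VariableWrapper or VariableWrapperHook classes), showing why returning the incremented id makes a hook individually removable:

#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <vector>

template <typename Hook>
class HookRegistry {
 public:
  // Each registration gets a unique, monotonically increasing id;
  // that id is the handle later used to remove exactly this hook.
  int64_t Add(std::shared_ptr<Hook> hook) {
    hooks_.emplace(next_id_, std::move(hook));
    return next_id_++;
  }
  bool Remove(int64_t id) { return hooks_.erase(id) > 0; }

  // Void hooks run once after the whole backward pass and are never
  // removed individually, so a plain vector suffices.
  void AddVoid(std::function<void()> fn) {
    void_hooks_.push_back(std::move(fn));
  }
  void RunVoidHooks() const {
    for (const auto& fn : void_hooks_) fn();
  }

 private:
  int64_t next_id_{0};
  std::map<int64_t, std::shared_ptr<Hook>> hooks_;
  std::vector<std::function<void()>> void_hooks_;
};
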
shared_grad_var = grad_var_.lock(); - PADDLE_ENFORCE_NOT_NULL( - shared_grad_var, - platform::errors::PermissionDenied( - "Cannot add gradient hook on Tensor without gradient.")); - return shared_grad_var; - } - - std::shared_ptr& GetGradVarInteriorHooksSafely() { - auto shared_grad_var = GetGradVarSafely(); - PADDLE_ENFORCE_EQ(HasGradNode(), true, - platform::errors::PermissionDenied( - "Only interior Tensor in backward can register " - "interior gradient hook.")); - if (shared_grad_var->interior_hooks_ == nullptr) { - shared_grad_var->interior_hooks_ = - std::make_shared(); - } - return shared_grad_var->interior_hooks_; - } - - std::shared_ptr& GetGradVarLeafHooksSafely() { - auto shared_grad_var = GetGradVarSafely(); - PADDLE_ENFORCE_EQ( - HasGradNode(), false, - platform::errors::PermissionDenied( - "Only leaf Tensor in backward can register leaf gradient hook.")); - if (shared_grad_var->leaf_hooks_ == nullptr) { - shared_grad_var->leaf_hooks_ = std::make_shared(); - } - return shared_grad_var->leaf_hooks_; - } - private: framework::Variable var_; std::string name_; @@ -358,11 +322,19 @@ class VariableWrapper { // isn't need bool is_empty_{false}; - // NOTE: only grad var can hold hooks now - // only interior var can hold interior hooks - std::shared_ptr interior_hooks_; - // only leaf var can hold leaf hooks - std::shared_ptr leaf_hooks_; + // NOTE(chenweihang): only grad var will hold hooks now + int64_t next_hook_id_{0}; + // [ Hooks with VariableWrapper as input and output ] + // NOTE: Now registered for grad var, support adding and removing, + // key is the accumulated int64_t value + // NOTE: Var hook need to support removing, so need hook id + std::map> var_hooks_; + // [ Hooks without input and output ] + // NOTE: Now registered after the execution of the entire backward + // process is over, currently only used for reducing in distributed + // training + // NOTE: Now no need to support remove void hook + std::vector>> void_hooks_; }; } // namespace imperative diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 9a4637306bb359..03f86cc7ba6de6 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -57,11 +57,9 @@ if(WITH_TESTING) if (NOT APPLE AND NOT WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_inference_shared ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) - set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) elseif(WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) - set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) endif() endif() diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 7bb092d0e3c1c0..4b6c746d57525a 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1192,6 +1192,8 @@ USE_TRT_CONVERTER(scale); USE_TRT_CONVERTER(stack); USE_TRT_CONVERTER(clip); USE_TRT_CONVERTER(gather); +USE_TRT_CONVERTER(anchor_generator); +USE_TRT_CONVERTER(yolo_box); USE_TRT_CONVERTER(roi_align); USE_TRT_CONVERTER(affine_channel); USE_TRT_CONVERTER(multiclass_nms); diff --git a/paddle/fluid/inference/api/demo_ci/clean.sh b/paddle/fluid/inference/api/demo_ci/clean.sh index 
0d9f3d2aa237ac..c265721db57752 100755 --- a/paddle/fluid/inference/api/demo_ci/clean.sh +++ b/paddle/fluid/inference/api/demo_ci/clean.sh @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set -x cd `dirname $0` rm -rf build/ data/ diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 61fcdb7a90830d..1d77ddaf73ef70 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -86,6 +86,7 @@ const std::vector kTRTSubgraphPasses({ "simplify_with_basic_ops_pass", // "embedding_eltwise_layernorm_fuse_pass", // "multihead_matmul_fuse_pass_v2", // + "multihead_matmul_fuse_pass_v3", // "skip_layernorm_fuse_pass", // "conv_bn_fuse_pass", // "unsqueeze2_eltwise_fuse_pass", // @@ -235,8 +236,8 @@ void CpuPassStrategy::EnableMKLDNN() { "reshape_transpose_matmul_mkldnn_fuse_pass", // "matmul_transpose_reshape_fuse_pass", // // Disabled due to topology-dependent speed-up - //"fc_mkldnn_pass", - //"fc_act_mkldnn_fuse_pass", + // "fc_mkldnn_pass", + // "fc_act_mkldnn_fuse_pass", "batch_norm_act_fuse_pass", // TODO(intel): Please fix the bug on windows. // https://github.com/PaddlePaddle/Paddle/issues/29710 diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index bc7b7355ea1922..3820ac5d7cc246 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -6,6 +6,8 @@ nv_library(tensorrt_converter shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc gather_op.cc + anchor_generator_op.cc + yolo_box_op.cc roi_align_op.cc affine_channel_op.cc multiclass_nms_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/anchor_generator_op.cc b/paddle/fluid/inference/tensorrt/convert/anchor_generator_op.cc new file mode 100644 index 00000000000000..56aab9785c90f3 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/anchor_generator_op.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* Anchor Generator Op */ +class AnchorGeneratorOpConverter : public OpConverter { + public: + void operator()(const paddle::framework::proto::OpDesc& op, + const paddle::framework::Scope& scope, + bool test_mode) override { + VLOG(3) << "convert a fluid anchor generator op to tensorrt plugin"; + framework::OpDesc op_desc(op, nullptr); + std::string input_name = op_desc.Input("Input").front(); + std::string anchor_name = op_desc.Output("Anchors").front(); + std::string variance_name = op_desc.Output("Variances").front(); + + auto* input = engine_->GetITensor(input_name); + const auto input_dims = input->getDimensions(); // C, H, W + std::vector output_names{anchor_name, variance_name}; + + const auto anchor_sizes = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("anchor_sizes")); + const auto aspect_ratios = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("aspect_ratios")); + const auto stride = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("stride")); + const auto variances = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("variances")); + const auto offset = BOOST_GET_CONST(float, op_desc.GetAttr("offset")); + const int num_anchors = aspect_ratios.size() * anchor_sizes.size(); + bool is_dynamic = engine_->with_dynamic_shape(); + const auto height = input_dims.d[1]; + const auto width = input_dims.d[2]; + const int box_num = width * height * num_anchors; + const nvinfer1::DataType data_type = nvinfer1::DataType::kFLOAT; + + nvinfer1::IPluginV2* anchor_generator_plugin = nullptr; + if (is_dynamic) { + anchor_generator_plugin = new plugin::AnchorGeneratorPluginDynamic( + data_type, anchor_sizes, aspect_ratios, stride, variances, offset, + num_anchors); + } else { + anchor_generator_plugin = new plugin::AnchorGeneratorPlugin( + data_type, anchor_sizes, aspect_ratios, stride, variances, offset, + height, width, num_anchors, box_num); + } + + std::vector anchor_generator_inputs{input}; + auto* anchor_generator_layer = engine_->network()->addPluginV2( + anchor_generator_inputs.data(), anchor_generator_inputs.size(), + *anchor_generator_plugin); + + RreplenishLayerAndOutput(anchor_generator_layer, "anchor_generator", + output_names, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(anchor_generator, AnchorGeneratorOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index 26cd7b22d2baaa..a6484a13557052 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -158,17 +158,49 @@ class BatchNormOpConverter : public OpConverter { TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; - nvinfer1::IScaleLayer* layer = - TRT_ENGINE_ADD_LAYER(engine_, Scale, *const_cast(X), - nvinfer1::ScaleMode::kCHANNEL, shift_weights.get(), - scale_weights.get(), power_weights.get()); + int dynamic_shape_offset = engine_->with_dynamic_shape() ? 
1 : 0; + nvinfer1::ILayer* layer = nullptr; + nvinfer1::IShuffleLayer* expand_layer = nullptr; + nvinfer1::IShuffleLayer* squeeze_layer = nullptr; + + auto x_dim = X->getDimensions(); + if (x_dim.nbDims < 3 + dynamic_shape_offset) { + nvinfer1::Dims expand_shape; + expand_shape.nbDims = 3 + dynamic_shape_offset; + for (int i = 0; i < 3 + dynamic_shape_offset; i++) { + if (i < x_dim.nbDims) { + expand_shape.d[i] = x_dim.d[i] < 0 ? 0 : x_dim.d[i]; + } else { + expand_shape.d[i] = 1; + } + } + expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + expand_layer->setReshapeDimensions(expand_shape); + X = expand_layer->getOutput(0); + } + + layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *X, nvinfer1::ScaleMode::kCHANNEL, shift_weights.get(), + scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Y").front(); engine_->SetWeights(op_desc.Input("Bias").front(), std::move(combile_bias_tensor)); engine_->SetWeights(op_desc.Input("Scale").front(), std::move(combile_scale_tensor)); - RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode); + if (x_dim.nbDims < 3 + dynamic_shape_offset) { + nvinfer1::Dims squeeze_shape; + squeeze_shape.nbDims = x_dim.nbDims; + for (int i = 0; i < squeeze_shape.nbDims; i++) { + squeeze_shape.d[i] = x_dim.d[i] < 0 ? 0 : x_dim.d[i]; + } + squeeze_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); + squeeze_layer->setReshapeDimensions(squeeze_shape); + layer = static_cast(squeeze_layer); + } + RreplenishLayerAndOutput(layer, "batchnorm_add_scale", {output_name}, + test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index dfadb28a6520f9..74057addecd1f9 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -251,7 +251,7 @@ class ElementwiseTensorOpConverter : public OpConverter { #if IS_TRT_VERSION_GE(6000) plugin::ElementwisePluginDynamic* plugin = new plugin::ElementwisePluginDynamic(op_type_, axis); - layer = engine_->AddPluginV2(itensors.data(), 2, plugin); + layer = engine_->AddDynamicPlugin(itensors.data(), 2, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index 7f8843a3f67d05..f13f1724541239 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -31,7 +31,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { #if IS_TRT_VERSION_GE(6000) - VLOG(4) << "convert fluid swish op to tensorrt layer"; + VLOG(4) << "convert fluid EmbEltwiseLayerNorm op to tensorrt layer"; framework::OpDesc op_desc(op, nullptr); auto id_names = op_desc.Input("Ids"); @@ -89,10 +89,14 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { int64_t bias_size = framework::product(bias_dims); int64_t scale_size = framework::product(scale_dims); nvinfer1::ILayer* layer = nullptr; + bool enable_int8 = op_desc.HasAttr("enable_int8"); if (engine_->with_dynamic_shape()) { if (engine_->use_oss()) { int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 
1 : 0); + if (enable_int8) { + output_fp16 = 1; + } PADDLE_ENFORCE_EQ( output_fp16, 1, platform::errors::InvalidArgument( @@ -169,7 +173,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { plugin = new plugin::EmbEltwiseLayernormPluginDynamic( input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, eps, with_fp16); - layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin); + layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin); auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, test_mode); diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 527d0ee208578a..194d76c737c7f9 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -106,8 +106,22 @@ class FcOpConverter : public OpConverter { auto regist_fc = [&](nvinfer1::ITensor* inputs, int n_output, TensorRTEngine::Weight& weight, TensorRTEngine::Weight& bias) { - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs, - n_output, weight.get(), bias.get()); + nvinfer1::ILayer* fc_layer = nullptr; + if (enable_int8) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("out_threshold"), true, + platform::errors::InvalidArgument( + "must have out threshold in fc layers in int8 mode")); + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + nvinfer1::DimsHW nv_ksize(1, 1); + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output, + nv_ksize, weight.get(), bias.get()); + engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); + } else { + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs, + n_output, weight.get(), bias.get()); + } auto output_name = op_desc.Output("Out").front(); if (activation_type == "relu") { @@ -229,13 +243,24 @@ class FcOpConverter : public OpConverter { "dims equals to 4, the last dim of input must be 1, but got %d", input_d[3])); } - for (int i = 0; i < 3; i++) { - if (i < input_dims) { - reshape_dim3[i] = input_d[i]; - } else { - reshape_dim3[i] = 1; + if (enable_int8) { + reshape_dim3[0] = 1; + for (int i = 0; i < 3; i++) { + reshape_dim3[0] *= input_d[i]; + if (i > 0) { + reshape_dim3[i] = 1; + } + } + } else { + for (int i = 0; i < 3; i++) { + if (i < input_dims) { + reshape_dim3[i] = input_d[i]; + } else { + reshape_dim3[i] = 1; + } } } + nvinfer1::Dims3 reshape_dim(reshape_dim3[0], reshape_dim3[1], reshape_dim3[2]); auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); @@ -249,11 +274,25 @@ class FcOpConverter : public OpConverter { platform::errors::InvalidArgument( "Invalid dimensions. 
When x_num_col_dims equals to " "2, input_dims should not be 1")); - for (int i = 0; i < 4; i++) { - if (i < input_dims) { - reshape_dim4[i] = input_d[i]; - } else { - reshape_dim4[i] = 1; + + if (enable_int8) { + for (int i = 0; i < 4; i++) { + if (i == 0) { + reshape_dim4[i] = input_d[i]; + } else { + reshape_dim4[i] = 1; + if (i < input_dims) { + reshape_dim4[1] *= input_d[i]; + } + } + } + } else { + for (int i = 0; i < 4; i++) { + if (i < input_dims) { + reshape_dim4[i] = input_d[i]; + } else { + reshape_dim4[i] = 1; + } } } nvinfer1::Dims4 reshape_dim(reshape_dim4[0], reshape_dim4[1], diff --git a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc index 4c9996ca02cad4..ca5b6a8b52e797 100644 --- a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc @@ -64,7 +64,7 @@ class GeluOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::GeluPluginDynamic* plugin = new plugin::GeluPluginDynamic(with_fp16); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index 8ce46a19d4b06e..f2f45c694ab44f 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -40,8 +40,25 @@ class MultiheadMatMulOpConverter : public OpConverter { auto* bias_v = scope.FindVar(bias_name); auto* bias_t = bias_v->GetMutable(); - float* weight_data = - engine_->GetWeightCPUData(weight_name, weight_t, false); + float* weight_data = nullptr; + bool enable_int8 = op_desc.HasAttr("enable_int8"); + float in_scale = 0.; + + if (enable_int8) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("Input_scale"), true, + platform::errors::InvalidArgument( + "must have input scale in multihead layers in int8 mode")); + in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127; + auto weight_scale = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("weight_scale")); + weight_data = + engine_->GetWeightCPUData(weight_name, weight_t, true, weight_scale); + engine_->SetTensorDynamicRange(input, in_scale); + } else { + weight_data = engine_->GetWeightCPUData(weight_name, weight_t, false); + } + float* bias_data = engine_->GetWeightCPUData(bias_name, bias_t, false); std::vector weight_data_tmp; weight_data_tmp.reserve(weight_t->numel()); @@ -117,8 +134,27 @@ class MultiheadMatMulOpConverter : public OpConverter { static_cast(bias_data), static_cast(bias_t->numel())}; - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, - n, weight, bias); + nvinfer1::ILayer* fc_layer = nullptr; + float dp_probs = 1.0 / 127.0; + if (enable_int8) { + nvinfer1::DimsHW nv_ksize(1, 1); + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *input, n, + nv_ksize, weight, bias); + } else { + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, n, + weight, bias); + } + + if (enable_int8) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("out_threshold"), true, + platform::errors::InvalidArgument( + "must have out threshold in multihead layers in int8 mode")); + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + 
engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); + dp_probs = out_scale / 127.0; + } auto mask_tensor = engine_->GetITensor("qkv_plugin_mask"); @@ -128,6 +164,9 @@ class MultiheadMatMulOpConverter : public OpConverter { int type = static_cast((engine_->WithFp16() == 1) ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT); + if (enable_int8) { + type = static_cast(nvinfer1::DataType::kHALF); + } bool has_mask = true; int var_seqlen = 1; const std::vector fields{ @@ -136,7 +175,7 @@ class MultiheadMatMulOpConverter : public OpConverter { {"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1}, {"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1}, {"var_seqlen", &var_seqlen, nvinfer1::PluginFieldType::kINT32, 1}, - }; + { "dq_probs", &dp_probs, nvinfer1::PluginFieldType::kFLOAT32, 1 }}; nvinfer1::PluginFieldCollection* plugin_collection = static_cast( malloc(sizeof(*plugin_collection) + @@ -227,7 +266,7 @@ class MultiheadMatMulOpConverter : public OpConverter { plugin::DynamicPluginTensorRT* plugin = new plugin::QkvToContextPluginDynamic(hidden_in, head_number, head_size, scale, with_fp16); - layer = engine_->AddPluginV2(plugin_inputs.data(), 2, plugin); + layer = engine_->AddDynamicPlugin(plugin_inputs.data(), 2, plugin); } } else { PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index aa4e54b5845722..c10072602d7c51 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -147,7 +147,7 @@ class Pool2dOpConverter : public OpConverter { plugin::PoolPluginDynamic *plugin = new plugin::PoolPluginDynamic(ceil_mode, pool_type, adaptive, ksize, strides, paddings, global_pooling); - layer = engine_->AddPluginV2(&input1, 1, plugin); + layer = engine_->AddDynamicPlugin(&input1, 1, plugin); #endif } auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index 5e881ecbbc4e2c..74d77d8be44937 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -65,7 +65,7 @@ class PReluOpConverter : public OpConverter { #if IS_TRT_VERSION_GE(6000) plugin::PReluPluginDynamic* plugin = new plugin::PReluPluginDynamic( alpha_data, alpha_tensor_temp->numel(), mode); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index 2e4a4e6120d2d8..b44bdcef7123c2 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -49,6 +49,7 @@ class SkipLayerNormOpConverter : public OpConverter { auto* scale = get_persistable_data("Scale", &scale_dims); int bias_size = framework::product(bias_dims); int scale_size = framework::product(scale_dims); + bool enable_int8 = op_desc.HasAttr("enable_int8"); nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { @@ -62,6 +63,10 @@ class SkipLayerNormOpConverter : public OpConverter { int ld = input1->getDimensions().d[2]; // hidden dimension assert(ld > 0); + if (enable_int8) { + type = 
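
The int8 handling in these converters is a straightforward rescaling by 127: the converter recovers a TensorRT dynamic range by multiplying the stored Input_scale attribute by 127, and turns the out_threshold range back into a dequantization factor for the plugin by dividing by 127. A short sketch of that arithmetic with hypothetical values (not taken from any real model):

#include <cmath>
#include <cstdio>

int main() {
  // Hypothetical values, for illustration only.
  const float input_scale_attr = 0.05f;  // per-element scale from the quant pass
  const float input_dynamic_range = input_scale_attr * 127.0f;  // what SetTensorDynamicRange expects

  const float out_threshold = 4.2f;               // max-abs range of the layer output
  const float dq_probs = out_threshold / 127.0f;  // scale handed to the plugin as dq_probs

  // Quantize / dequantize one activation value under this convention.
  const float x = 3.1f;
  const long q = std::lround(x / out_threshold * 127.0f);
  const float x_restored = static_cast<float>(q) * dq_probs;

  std::printf("range=%.2f q=%ld restored=%.3f\n", input_dynamic_range, q, x_restored);
  return 0;
}
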
static_cast(nvinfer1::DataType::kHALF); + } + const std::vector fields{ {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1}, @@ -90,7 +95,7 @@ class SkipLayerNormOpConverter : public OpConverter { plugin::SkipLayerNormPluginDynamic* plugin = new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, scale_size, eps, with_fp16); - layer = engine_->AddPluginV2(inputs.data(), 2, plugin); + layer = engine_->AddDynamicPlugin(inputs.data(), 2, plugin); } } else { PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index 0bd2b8c9bf5eef..aee39b7cf0c14c 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -31,6 +31,12 @@ class SliceOpConverter : public OpConverter { // Declare inputs auto* input = engine_->GetITensor(op_desc.Input("Input")[0]); + if (op_desc.HasAttr("out_threshold")) { + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + engine_->SetTensorDynamicRange(input, out_scale); + } + std::vector axes = BOOST_GET_CONST(std::vector, op_desc.GetAttr("axes")); std::vector starts = @@ -90,14 +96,14 @@ class SliceOpConverter : public OpConverter { // bool ban_fp16 = engine_->disable_trt_plugin_fp16(); plugin::SpecialSlicePluginDynamic* plugin = new plugin::SpecialSlicePluginDynamic(); - layer = engine_->AddPluginV2(plugin_inputs.data(), plugin_inputs.size(), - plugin); + layer = engine_->AddDynamicPlugin(plugin_inputs.data(), + plugin_inputs.size(), plugin); } else { bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SlicePluginDynamic* plugin = new plugin::SlicePluginDynamic(starts, ends, axes, with_fp16); - layer = engine_->AddPluginV2(&input, 1, plugin); + layer = engine_->AddDynamicPlugin(&input, 1, plugin); } #else PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 5d494c2093b2a9..75b317e7bfd90e 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -90,7 +90,7 @@ class SplitOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SplitPluginDynamic* plugin = new plugin::SplitPluginDynamic(axis, output_lengths, with_fp16); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/stack_op.cc b/paddle/fluid/inference/tensorrt/convert/stack_op.cc index 1c971fa12e27e8..a0292b21124633 100644 --- a/paddle/fluid/inference/tensorrt/convert/stack_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/stack_op.cc @@ -45,6 +45,11 @@ class StackOpConverter : public OpConverter { for (int i = 0; i < input_num; ++i) { inputs[i] = engine_->GetITensor(input[i]); + if (op_desc.HasAttr("out_threshold")) { + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + engine_->SetTensorDynamicRange(inputs[i], out_scale); + } } int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis")); @@ -59,7 +64,7 @@ class StackOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::StackPluginDynamic* plugin = new 
plugin::StackPluginDynamic(axis, input_num, with_fp16); - layer = engine_->AddPluginV2(inputs, input_num, plugin); + layer = engine_->AddDynamicPlugin(inputs, input_num, plugin); assert(layer != nullptr); #else PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/convert/swish_op.cc b/paddle/fluid/inference/tensorrt/convert/swish_op.cc index 25944a2fead6cd..b2e394d14eba23 100644 --- a/paddle/fluid/inference/tensorrt/convert/swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/swish_op.cc @@ -65,7 +65,7 @@ class SwishOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SwishPluginDynamic* plugin = new plugin::SwishPluginDynamic(beta, with_fp16); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc new file mode 100644 index 00000000000000..2d12eaf736b754 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class YoloBoxOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid yolo box op to tensorrt plugin"; + + framework::OpDesc op_desc(op, nullptr); + std::string X = op_desc.Input("X").front(); + std::string img_size = op_desc.Input("ImgSize").front(); + + auto* X_tensor = engine_->GetITensor(X); + auto* img_size_tensor = engine_->GetITensor(img_size); + + int class_num = BOOST_GET_CONST(int, op_desc.GetAttr("class_num")); + std::vector anchors = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("anchors")); + + int downsample_ratio = + BOOST_GET_CONST(int, op_desc.GetAttr("downsample_ratio")); + float conf_thresh = BOOST_GET_CONST(float, op_desc.GetAttr("conf_thresh")); + bool clip_bbox = BOOST_GET_CONST(bool, op_desc.GetAttr("clip_bbox")); + float scale_x_y = BOOST_GET_CONST(float, op_desc.GetAttr("scale_x_y")); + + int type_id = static_cast(engine_->WithFp16()); + auto input_dim = X_tensor->getDimensions(); + auto* yolo_box_plugin = new plugin::YoloBoxPlugin( + type_id ? 
nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, + anchors, class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, + input_dim.d[1], input_dim.d[2]); + + std::vector yolo_box_inputs; + yolo_box_inputs.push_back(X_tensor); + yolo_box_inputs.push_back(img_size_tensor); + + auto* yolo_box_layer = engine_->network()->addPluginV2( + yolo_box_inputs.data(), yolo_box_inputs.size(), *yolo_box_plugin); + + std::vector output_names; + output_names.push_back(op_desc.Output("Boxes").front()); + output_names.push_back(op_desc.Output("Scores").front()); + + RreplenishLayerAndOutput(yolo_box_layer, "yolo_box", output_names, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(yolo_box, YoloBoxOpConverter); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index de2924824f09de..2358e1ef976cdb 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -378,9 +378,9 @@ class TensorRTEngine { bool with_dynamic_shape() { return with_dynamic_shape_; } #if IS_TRT_VERSION_GE(6000) - nvinfer1::IPluginV2Layer* AddPluginV2(nvinfer1::ITensor* const* inputs, - int num_inputs, - plugin::DynamicPluginTensorRT* plugin) { + nvinfer1::IPluginV2Layer* AddDynamicPlugin( + nvinfer1::ITensor* const* inputs, int num_inputs, + plugin::DynamicPluginTensorRT* plugin) { owned_pluginv2_.emplace_back(plugin); return network()->addPluginV2(inputs, num_inputs, *plugin); } diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 7c1b2e8001edbd..53225b79780773 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -45,6 +45,12 @@ struct SimpleOpTypeSetTeller : public Teller { #endif #if IS_TRT_VERSION_GE(7130) teller_set.insert("group_norm"); + int8_teller_set.insert("multihead_matmul"); + int8_teller_set.insert("skip_layernorm"); + int8_teller_set.insert("fused_embedding_eltwise_layernorm"); + int8_teller_set.insert("matmul"); + int8_teller_set.insert("stack"); + int8_teller_set.insert("slice"); #endif } @@ -111,10 +117,11 @@ struct SimpleOpTypeSetTeller : public Teller { "flatten2", "flatten", "gather", + "yolo_box", "roi_align", "affine_channel", - "multiclass_nms", "nearest_interp", + "anchor_generator", }; }; @@ -198,6 +205,15 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false; } + if (op_type == "yolo_box") { + if (with_dynamic_shape) return false; + bool has_attrs = + (desc.HasAttr("class_num") && desc.HasAttr("anchors") && + desc.HasAttr("downsample_ratio") && desc.HasAttr("conf_thresh") && + desc.HasAttr("clip_bbox") && desc.HasAttr("scale_x_y")); + if (!has_attrs) return false; + } + if (op_type == "affine_channel") { if (!desc.HasAttr("data_layout")) return false; auto data_layout = framework::StringToDataLayout( diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 4107f9ef674339..1804e6c5571d3a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -5,6 +5,8 @@ nv_library(tensorrt_plugin instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu hard_swish_op_plugin.cu stack_op_plugin.cu special_slice_plugin.cu + 
anchor_generator_op_plugin.cu + yolo_box_op_plugin.cu roi_align_op_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu new file mode 100644 index 00000000000000..01ee86ceb48a9e --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu @@ -0,0 +1,566 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +#include "paddle/fluid/operators/detection/anchor_generator_op.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#define PrepareParamsOnDevice() \ + constexpr int data_size = 4; \ + cudaMalloc(&anchor_sizes_device_, anchor_sizes_.size() * data_size); \ + cudaMalloc(&aspect_ratios_device_, aspect_ratios_.size() * data_size); \ + cudaMalloc(&stride_device_, stride_.size() * data_size); \ + cudaMalloc(&variances_device_, variances_.size() * data_size); \ + cudaMemcpy(anchor_sizes_device_, anchor_sizes_.data(), \ + anchor_sizes_.size() * data_size, cudaMemcpyHostToDevice); \ + cudaMemcpy(aspect_ratios_device_, aspect_ratios_.data(), \ + aspect_ratios_.size() * data_size, cudaMemcpyHostToDevice); \ + cudaMemcpy(stride_device_, stride_.data(), stride_.size() * data_size, \ + cudaMemcpyHostToDevice); \ + cudaMemcpy(variances_device_, variances_.data(), \ + variances_.size() * data_size, cudaMemcpyHostToDevice); + +AnchorGeneratorPlugin::AnchorGeneratorPlugin( + const nvinfer1::DataType data_type, const std::vector& anchor_sizes, + const std::vector& aspect_ratios, const std::vector& stride, + const std::vector& variances, const float offset, const int height, + const int width, const int num_anchors, const int box_num) + : data_type_(data_type), + anchor_sizes_(anchor_sizes), + aspect_ratios_(aspect_ratios), + stride_(stride), + variances_(variances), + offset_(offset), + height_(height), + width_(width), + num_anchors_(num_anchors), + box_num_(box_num) { + // anchors must be float32, which is the generator proposals' input + PADDLE_ENFORCE_EQ(data_type_, nvinfer1::DataType::kFLOAT, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts float32.")); + PADDLE_ENFORCE_GE(height_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts height " + "greater than 0, but receive height = %d.", + height_)); + PADDLE_ENFORCE_GE(width_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts width " + "greater than 0, but receive width = %d.", + width_)); + PADDLE_ENFORCE_GE( + num_anchors_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts number of anchors greater " + "than 0, but 
receive number of anchors = %d.", + num_anchors_)); + PADDLE_ENFORCE_GE(box_num_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts box_num " + "greater than 0, but receive box_num = %d.", + box_num_)); + PrepareParamsOnDevice(); +} + +AnchorGeneratorPlugin::~AnchorGeneratorPlugin() { + auto release_device_ptr = [](void* ptr) { + if (ptr) { + cudaFree(ptr); + ptr = nullptr; + } + }; + release_device_ptr(anchor_sizes_device_); + release_device_ptr(aspect_ratios_device_); + release_device_ptr(stride_device_); + release_device_ptr(variances_device_); +} + +AnchorGeneratorPlugin::AnchorGeneratorPlugin(const void* data, size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &anchor_sizes_); + DeserializeValue(&data, &length, &aspect_ratios_); + DeserializeValue(&data, &length, &stride_); + DeserializeValue(&data, &length, &variances_); + DeserializeValue(&data, &length, &offset_); + DeserializeValue(&data, &length, &height_); + DeserializeValue(&data, &length, &width_); + DeserializeValue(&data, &length, &num_anchors_); + DeserializeValue(&data, &length, &box_num_); + PrepareParamsOnDevice(); +} + +const char* AnchorGeneratorPlugin::getPluginType() const { + return "anchor_generator_plugin"; +} + +const char* AnchorGeneratorPlugin::getPluginVersion() const { return "1"; } + +int AnchorGeneratorPlugin::getNbOutputs() const { return 2; } + +nvinfer1::Dims AnchorGeneratorPlugin::getOutputDimensions( + int index, const nvinfer1::Dims* inputs, int nb_input_dims) { + nvinfer1::Dims dims{}; + dims.nbDims = 4; + dims.d[0] = height_; + dims.d[1] = width_; + dims.d[2] = num_anchors_; + dims.d[3] = 4; + return dims; +} + +bool AnchorGeneratorPlugin::supportsFormat( + nvinfer1::DataType type, nvinfer1::TensorFormat format) const { + // static shape plugin can't support different type between input/out + // it may cause addition overhead in half mode + return (type == data_type_ && format == nvinfer1::TensorFormat::kLINEAR); +} + +size_t AnchorGeneratorPlugin::getWorkspaceSize(int max_batch_size) const { + return 0; +} + +template +int AnchorGeneratorPlugin::enqueue_impl(int batch_size, + const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + const int block = 512; + const int gen_anchor_grid = (box_num_ + block - 1) / block; + T* anchors = static_cast(outputs[0]); + T* vars = static_cast(outputs[1]); + const T* anchor_sizes_device = static_cast(anchor_sizes_device_); + const T* aspect_ratios_device = static_cast(aspect_ratios_device_); + const T* stride_device = static_cast(stride_device_); + const T* variances_device = static_cast(variances_device_); + paddle::operators::GenAnchors<<>>( + anchors, aspect_ratios_device, aspect_ratios_.size(), anchor_sizes_device, + anchor_sizes_.size(), stride_device, stride_.size(), height_, width_, + offset_); + const int var_grid = (box_num_ * 4 + block - 1) / block; + paddle::operators::SetVariance<<>>( + vars, variances_device, variances_.size(), box_num_ * 4); + return cudaGetLastError() != cudaSuccess; +} + +int AnchorGeneratorPlugin::enqueue(int batch_size, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + return enqueue_impl(batch_size, inputs, outputs, workspace, stream); +} + +int AnchorGeneratorPlugin::initialize() { return 0; } + +void AnchorGeneratorPlugin::terminate() {} + +size_t AnchorGeneratorPlugin::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += 
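
The kernel launch in enqueue_impl is a standard ceiling-division grid over box_num threads (and box_num * 4 for the variances). Plugging in hypothetical numbers, e.g. a 50x38 feature map with 3 anchor sizes and 3 aspect ratios as in the converter's num_anchors = aspect_ratios.size() * anchor_sizes.size(), gives:

#include <cstdio>

int main() {
  // Hypothetical feature-map and anchor configuration, illustration only.
  const int height = 38, width = 50;
  const int num_anchor_sizes = 3, num_aspect_ratios = 3;
  const int num_anchors = num_anchor_sizes * num_aspect_ratios;  // 9
  const int box_num = width * height * num_anchors;              // 17100

  const int block = 512;
  const int gen_anchor_grid = (box_num + block - 1) / block;  // 34 blocks for GenAnchors
  const int var_grid = (box_num * 4 + block - 1) / block;     // 134 blocks, 4 variance values per box

  std::printf("box_num=%d gen_grid=%d var_grid=%d\n", box_num, gen_anchor_grid,
              var_grid);
  return 0;
}
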
SerializedSize(data_type_); + serialize_size += SerializedSize(anchor_sizes_); + serialize_size += SerializedSize(aspect_ratios_); + serialize_size += SerializedSize(stride_); + serialize_size += SerializedSize(variances_); + serialize_size += SerializedSize(offset_); + serialize_size += SerializedSize(height_); + serialize_size += SerializedSize(width_); + serialize_size += SerializedSize(num_anchors_); + serialize_size += SerializedSize(box_num_); + return serialize_size; +} + +void AnchorGeneratorPlugin::serialize(void* buffer) const { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, anchor_sizes_); + SerializeValue(&buffer, aspect_ratios_); + SerializeValue(&buffer, stride_); + SerializeValue(&buffer, variances_); + SerializeValue(&buffer, offset_); + SerializeValue(&buffer, height_); + SerializeValue(&buffer, width_); + SerializeValue(&buffer, num_anchors_); + SerializeValue(&buffer, box_num_); +} + +void AnchorGeneratorPlugin::destroy() {} + +void AnchorGeneratorPlugin::setPluginNamespace(const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* AnchorGeneratorPlugin::getPluginNamespace() const { + return namespace_.c_str(); +} + +nvinfer1::DataType AnchorGeneratorPlugin::getOutputDataType( + int index, const nvinfer1::DataType* input_type, int nb_inputs) const { + return data_type_; +} + +bool AnchorGeneratorPlugin::isOutputBroadcastAcrossBatch( + int output_index, const bool* input_is_broadcast, int nb_inputs) const { + return true; +} + +bool AnchorGeneratorPlugin::canBroadcastInputAcrossBatch( + int input_index) const { + return false; +} + +void AnchorGeneratorPlugin::configurePlugin( + const nvinfer1::Dims* input_dims, int nb_inputs, + const nvinfer1::Dims* output_dims, int nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, const bool* input_is_broadcast, + const bool* output_is_broadcast, nvinfer1::PluginFormat float_format, + int max_batct_size) {} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPlugin::clone() const { + auto plugin = new AnchorGeneratorPlugin( + data_type_, anchor_sizes_, aspect_ratios_, stride_, variances_, offset_, + height_, width_, num_anchors_, box_num_); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +void AnchorGeneratorPluginCreator::setPluginNamespace( + const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* AnchorGeneratorPluginCreator::getPluginNamespace() const { + return namespace_.c_str(); +} + +const char* AnchorGeneratorPluginCreator::getPluginName() const { + return "anchor_generator_plugin"; +} + +const char* AnchorGeneratorPluginCreator::getPluginVersion() const { + return "1"; +} + +const nvinfer1::PluginFieldCollection* +AnchorGeneratorPluginCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPluginCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + const nvinfer1::PluginField* fields = fc->fields; + int type_id = -1; + std::vector anchor_sizes, aspect_ratios, stride, variances; + float offset = .5; + int height = -1, width = -1; + int num_anchors = -1; + int box_num = -1; + + for (int i = 0; i < fc->nbFields; ++i) { + const std::string field_name(fc->fields[i].name); + const auto length = fc->fields[i].length; + if (field_name.compare("type_id") == 0) { + type_id = *static_cast(fc->fields[i].data); + } else if (field_name.compare("anchor_sizes")) { + const auto* data = static_cast(fc->fields[i].data); + 
anchor_sizes.insert(anchor_sizes.end(), data, data + length); + } else if (field_name.compare("aspect_ratios")) { + const auto* data = static_cast(fc->fields[i].data); + aspect_ratios.insert(aspect_ratios.end(), data, data + length); + } else if (field_name.compare("stride")) { + const auto* data = static_cast(fc->fields[i].data); + stride.insert(stride.end(), data, data + length); + } else if (field_name.compare("variances")) { + const auto* data = static_cast(fc->fields[i].data); + variances.insert(variances.end(), data, data + length); + } else if (field_name.compare("offset")) { + offset = *static_cast(fc->fields[i].data); + } else if (field_name.compare("height")) { + height = *static_cast(fc->fields[i].data); + } else if (field_name.compare("width")) { + width = *static_cast(fc->fields[i].data); + } else if (field_name.compare("num_anchors")) { + num_anchors = *static_cast(fc->fields[i].data); + } else if (field_name.compare("box_num")) { + box_num = *static_cast(fc->fields[i].data); + } else { + assert(false && "unknown plugin field name."); + } + } + return new AnchorGeneratorPlugin(nvinfer1::DataType::kFLOAT, anchor_sizes, + aspect_ratios, stride, variances, offset, + height, width, num_anchors, box_num); +} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPluginCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new AnchorGeneratorPlugin(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +#if IS_TRT_VERSION_GE(6000) +AnchorGeneratorPluginDynamic::AnchorGeneratorPluginDynamic( + const nvinfer1::DataType data_type, const std::vector& anchor_sizes, + const std::vector& aspect_ratios, const std::vector& stride, + const std::vector& variances, const float offset, + const int num_anchors) + : data_type_(data_type), + anchor_sizes_(anchor_sizes), + aspect_ratios_(aspect_ratios), + stride_(stride), + variances_(variances), + offset_(offset), + num_anchors_(num_anchors) { + // data_type_ is used to determine the output data type + // data_type_ can only be float32 + // height, width, num_anchors are calculated at configurePlugin + PADDLE_ENFORCE_EQ(data_type_, nvinfer1::DataType::kFLOAT, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts float32.")); + PADDLE_ENFORCE_GE( + num_anchors_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts number of anchors greater " + "than 0, but receive number of anchors = %d.", + num_anchors_)); + PrepareParamsOnDevice(); +} + +AnchorGeneratorPluginDynamic::~AnchorGeneratorPluginDynamic() { + auto release_device_ptr = [](void* ptr) { + if (ptr) { + cudaFree(ptr); + ptr = nullptr; + } + }; + release_device_ptr(anchor_sizes_device_); + release_device_ptr(aspect_ratios_device_); + release_device_ptr(stride_device_); + release_device_ptr(variances_device_); +} + +AnchorGeneratorPluginDynamic::AnchorGeneratorPluginDynamic(void const* data, + size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &anchor_sizes_); + DeserializeValue(&data, &length, &aspect_ratios_); + DeserializeValue(&data, &length, &stride_); + DeserializeValue(&data, &length, &variances_); + DeserializeValue(&data, &length, &offset_); + DeserializeValue(&data, &length, &num_anchors_); + PrepareParamsOnDevice(); +} + +nvinfer1::IPluginV2DynamicExt* AnchorGeneratorPluginDynamic::clone() const { + auto plugin = new AnchorGeneratorPluginDynamic( + data_type_, anchor_sizes_, 
aspect_ratios_, stride_, variances_, offset_,
+      num_anchors_);
+  plugin->setPluginNamespace(namespace_.c_str());
+  return plugin;
+}
+
+nvinfer1::DimsExprs AnchorGeneratorPluginDynamic::getOutputDimensions(
+    int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
+    nvinfer1::IExprBuilder& exprBuilder) {
+  nvinfer1::DimsExprs ret{};
+  ret.nbDims = 4;
+  ret.d[0] = inputs[0].d[2];  // feature height
+  ret.d[1] = inputs[0].d[3];  // feature width
+  ret.d[2] = exprBuilder.constant(num_anchors_);
+  ret.d[3] = exprBuilder.constant(4);
+  return ret;
+}
+
+bool AnchorGeneratorPluginDynamic::supportsFormatCombination(
+    int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs,
+    int nbOutputs) {
+  // the input can be any type and format; it doesn't matter because
+  // the anchor generator doesn't read input raw data, it only needs the shape
+  auto type = inOut[pos].type;
+  auto format = inOut[pos].format;
+#if IS_TRT_VERSION_GE(7234)
+  if (pos == 0) return true;
+#else
+  if (pos == 0) return format == nvinfer1::TensorFormat::kLINEAR;
+#endif
+  return (type == nvinfer1::DataType::kFLOAT &&
+          format == nvinfer1::TensorFormat::kLINEAR);
+}
+
+void AnchorGeneratorPluginDynamic::configurePlugin(
+    const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs,
+    const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {}
+
+size_t AnchorGeneratorPluginDynamic::getWorkspaceSize(
+    const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
+    const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const {
+  return 0;
+}
+
+template <typename T>
+int AnchorGeneratorPluginDynamic::enqueue_impl(
+    const nvinfer1::PluginTensorDesc* inputDesc,
+    const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
+    void* const* outputs, void* workspace, cudaStream_t stream) {
+  const int height = inputDesc[0].dims.d[2];
+  const int width = inputDesc[0].dims.d[3];
+  const int box_num = height * width * num_anchors_;
+  const int block = 512;
+  const int gen_anchor_grid = (box_num + block - 1) / block;
+  T* anchors = static_cast<T*>(outputs[0]);
+  T* vars = static_cast<T*>(outputs[1]);
+  const T* anchor_sizes_device = static_cast<const T*>(anchor_sizes_device_);
+  const T* aspect_ratios_device = static_cast<const T*>(aspect_ratios_device_);
+  const T* stride_device = static_cast<const T*>(stride_device_);
+  const T* variances_device = static_cast<const T*>(variances_device_);
+  paddle::operators::GenAnchors<T><<<gen_anchor_grid, block, 0, stream>>>(
+      anchors, aspect_ratios_device, aspect_ratios_.size(),
+      anchor_sizes_device, anchor_sizes_.size(), stride_device,
+      stride_.size(), height, width, offset_);
+  const int var_grid = (box_num * 4 + block - 1) / block;
+  paddle::operators::SetVariance<T><<<var_grid, block, 0, stream>>>(
+      vars, variances_device, variances_.size(), box_num * 4);
+  return cudaGetLastError() != cudaSuccess;
+}
+
+int AnchorGeneratorPluginDynamic::enqueue(
+    const nvinfer1::PluginTensorDesc* inputDesc,
+    const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
+    void* const* outputs, void* workspace, cudaStream_t stream) {
+  assert(outputDesc[0].type == nvinfer1::DataType::kFLOAT);
+  assert(outputDesc[1].type == nvinfer1::DataType::kFLOAT);
+  return enqueue_impl<float>(inputDesc, outputDesc, inputs, outputs, workspace,
+                             stream);
+}
+
+nvinfer1::DataType AnchorGeneratorPluginDynamic::getOutputDataType(
+    int index, const nvinfer1::DataType* inputTypes, int nbInputs) const {
+  return data_type_;
+}
+
+const char* AnchorGeneratorPluginDynamic::getPluginType() const {
+  return "anchor_generator_plugin_dynamic";
+}
+
+int AnchorGeneratorPluginDynamic::getNbOutputs() const { return 2; }
+
+int
AnchorGeneratorPluginDynamic::initialize() { return 0; } + +void AnchorGeneratorPluginDynamic::terminate() {} + +size_t AnchorGeneratorPluginDynamic::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += SerializedSize(data_type_); + serialize_size += SerializedSize(anchor_sizes_); + serialize_size += SerializedSize(aspect_ratios_); + serialize_size += SerializedSize(stride_); + serialize_size += SerializedSize(variances_); + serialize_size += SerializedSize(offset_); + serialize_size += SerializedSize(num_anchors_); + return serialize_size; +} + +void AnchorGeneratorPluginDynamic::serialize(void* buffer) const { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, anchor_sizes_); + SerializeValue(&buffer, aspect_ratios_); + SerializeValue(&buffer, stride_); + SerializeValue(&buffer, variances_); + SerializeValue(&buffer, offset_); + SerializeValue(&buffer, num_anchors_); +} + +void AnchorGeneratorPluginDynamic::destroy() {} + +void AnchorGeneratorPluginDynamicCreator::setPluginNamespace( + const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* AnchorGeneratorPluginDynamicCreator::getPluginNamespace() const { + return namespace_.c_str(); +} + +const char* AnchorGeneratorPluginDynamicCreator::getPluginName() const { + return "anchor_generator_plugin_dynamic"; +} + +const char* AnchorGeneratorPluginDynamicCreator::getPluginVersion() const { + return "1"; +} + +const nvinfer1::PluginFieldCollection* +AnchorGeneratorPluginDynamicCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPluginDynamicCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + const nvinfer1::PluginField* fields = fc->fields; + int type_id = -1; + std::vector anchor_sizes, aspect_ratios, stride, variances; + float offset = .5; + int num_anchors = -1; + for (int i = 0; i < fc->nbFields; ++i) { + const std::string field_name(fc->fields[i].name); + const auto length = fc->fields[i].length; + if (field_name.compare("type_id") == 0) { + type_id = *static_cast(fc->fields[i].data); + } else if (field_name.compare("anchor_sizes")) { + const auto* data = static_cast(fc->fields[i].data); + anchor_sizes.insert(anchor_sizes.end(), data, data + length); + } else if (field_name.compare("aspect_ratios")) { + const auto* data = static_cast(fc->fields[i].data); + aspect_ratios.insert(aspect_ratios.end(), data, data + length); + } else if (field_name.compare("stride")) { + const auto* data = static_cast(fc->fields[i].data); + stride.insert(stride.end(), data, data + length); + } else if (field_name.compare("variances")) { + const auto* data = static_cast(fc->fields[i].data); + variances.insert(variances.end(), data, data + length); + } else if (field_name.compare("offset")) { + offset = *static_cast(fc->fields[i].data); + } else if (field_name.compare("num_anchors")) { + num_anchors = *static_cast(fc->fields[i].data); + } else { + assert(false && "unknown plugin field name."); + } + } + return new AnchorGeneratorPluginDynamic(nvinfer1::DataType::kFLOAT, + anchor_sizes, aspect_ratios, stride, + variances, offset, num_anchors); +} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPluginDynamicCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new AnchorGeneratorPluginDynamic(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} +#endif + +} // namespace plugin +} // namespace tensorrt +} // 
namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h new file mode 100644 index 00000000000000..aff0b6a6802f11 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h @@ -0,0 +1,201 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class AnchorGeneratorPlugin : public nvinfer1::IPluginV2Ext { + public: + explicit AnchorGeneratorPlugin( + const nvinfer1::DataType, const std::vector& anchor_sizes, + const std::vector& aspect_ratios, const std::vector& stride, + const std::vector& variances, const float offset, const int height, + const int width, const int num_anchors, const int box_num); + AnchorGeneratorPlugin(const void* data, size_t length); + ~AnchorGeneratorPlugin() override; + const char* getPluginType() const override; + const char* getPluginVersion() const override; + int getNbOutputs() const override; + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nb_input_dims) override; + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::TensorFormat format) const override; + size_t getWorkspaceSize(int max_batch_size) const override; + int enqueue(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) override; + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* input_type, + int nb_inputs) const override; + bool isOutputBroadcastAcrossBatch(int output_index, + const bool* input_is_broadcast, + int nb_inputs) const override; + bool canBroadcastInputAcrossBatch(int input_index) const override; + void configurePlugin(const nvinfer1::Dims* input_dims, int nb_inputs, + const nvinfer1::Dims* output_dims, int nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, + const bool* input_is_broadcast, + const bool* output_is_broadcast, + nvinfer1::PluginFormat float_format, + int max_batct_size) override; + nvinfer1::IPluginV2Ext* clone() const override; + + private: + template + int enqueue_impl(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream); + nvinfer1::DataType data_type_; + std::vector anchor_sizes_; + std::vector aspect_ratios_; + std::vector stride_; + std::vector variances_; + float offset_; + void* anchor_sizes_device_; + 
void* aspect_ratios_device_; + void* stride_device_; + void* variances_device_; + int height_; + int width_; + int num_anchors_; + int box_num_; + std::string namespace_; +}; + +class AnchorGeneratorPluginCreator : public nvinfer1::IPluginCreator { + public: + AnchorGeneratorPluginCreator() = default; + ~AnchorGeneratorPluginCreator() override = default; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; + +REGISTER_TRT_PLUGIN_V2(AnchorGeneratorPluginCreator); + +#if IS_TRT_VERSION_GE(6000) +class AnchorGeneratorPluginDynamic : public DynamicPluginTensorRT { + public: + explicit AnchorGeneratorPluginDynamic(const nvinfer1::DataType data_type, + const std::vector& anchor_sizes, + const std::vector& aspect_ratios, + const std::vector& stride, + const std::vector& variances, + const float offset, + const int num_anchors); + AnchorGeneratorPluginDynamic(void const* data, size_t length); + ~AnchorGeneratorPluginDynamic(); + nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const override; + const char* getPluginType() const override; + int getNbOutputs() const override; + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + + private: + template + int enqueue_impl(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, + void* workspace, cudaStream_t stream); + nvinfer1::DataType data_type_; + std::vector anchor_sizes_; + std::vector aspect_ratios_; + std::vector stride_; + std::vector variances_; + float offset_; + void* anchor_sizes_device_; + void* aspect_ratios_device_; + void* stride_device_; + void* variances_device_; + int num_anchors_; + std::string namespace_; +}; + +class AnchorGeneratorPluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + AnchorGeneratorPluginDynamicCreator() = default; + ~AnchorGeneratorPluginDynamicCreator() override = default; + void setPluginNamespace(const char* lib_namespace) 
override; + const char* getPluginNamespace() const override; + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; +REGISTER_TRT_PLUGIN_V2(AnchorGeneratorPluginDynamicCreator); +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h index 49212aae9aa90d..75a1dd85f0f2c4 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h @@ -144,9 +144,9 @@ class ElementwisePluginDynamic : public DynamicPluginTensorRT { int axis_; }; -class ElementwisePluginV2Creator : public nvinfer1::IPluginCreator { +class ElementwisePluginDynamicCreator : public nvinfer1::IPluginCreator { public: - ElementwisePluginV2Creator() {} + ElementwisePluginDynamicCreator() {} const char* getPluginName() const override { return "elementwise_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -182,7 +182,7 @@ class ElementwisePluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(ElementwisePluginV2Creator); +REGISTER_TRT_PLUGIN_V2(ElementwisePluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h index 6c8381a750cba9..7de84a8fc49bcc 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h @@ -306,9 +306,10 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { } }; -class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator { +class EmbEltwiseLayernormPluginDynamicCreator + : public nvinfer1::IPluginCreator { public: - EmbEltwiseLayernormPluginV2Creator() {} + EmbEltwiseLayernormPluginDynamicCreator() {} const char* getPluginName() const override { return "fused_embedding_eltwise_layernorm_plugin"; } @@ -345,7 +346,7 @@ class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(EmbEltwiseLayernormPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(EmbEltwiseLayernormPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h index 979f600a3a9cea..23e507ee477e1a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h @@ -115,9 +115,9 @@ class GeluPluginDynamic : public DynamicPluginTensorRT { void destroy() override { delete this; } }; -class GeluPluginV2Creator : public nvinfer1::IPluginCreator { +class GeluPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - GeluPluginV2Creator() {} + GeluPluginDynamicCreator() {} const char* getPluginName() const override { return "gelu_plugin"; } 
const char* getPluginVersion() const override { return "1"; } @@ -153,7 +153,7 @@ class GeluPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(GeluPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(GeluPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h index b852f5a454c07c..7147d9855755be 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h @@ -118,9 +118,9 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { float scale_; }; -class QkvToContextPluginV2Creator : public nvinfer1::IPluginCreator { +class QkvToContextPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - QkvToContextPluginV2Creator() {} + QkvToContextPluginDynamicCreator() {} const char* getPluginName() const override { return "qkv_to_context_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -155,7 +155,7 @@ class QkvToContextPluginV2Creator : public nvinfer1::IPluginCreator { nvinfer1::PluginFieldCollection field_collection_; std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(QkvToContextPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(QkvToContextPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu index 42c0df41a1b5ef..6e7ed0054f502e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu @@ -364,6 +364,7 @@ RoiAlignPluginDynamicCreator::getFieldNames() { nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::createPlugin( const char* name, const nvinfer1::PluginFieldCollection* fc) { const nvinfer1::PluginField* fields = fc->fields; + return nullptr; } nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::deserializePlugin( diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h index 0e457fdc8f4474..ac621784550f2f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h @@ -119,9 +119,9 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { float eps_; }; -class SkipLayerNormPluginV2Creator : public nvinfer1::IPluginCreator { +class SkipLayerNormPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SkipLayerNormPluginV2Creator() {} + SkipLayerNormPluginDynamicCreator() {} const char* getPluginName() const override { return "skip_layernorm_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -156,7 +156,7 @@ class SkipLayerNormPluginV2Creator : public nvinfer1::IPluginCreator { nvinfer1::PluginFieldCollection field_collection_; std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(SkipLayerNormPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SkipLayerNormPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h index 340406c5e7fae8..9d4f9a35c3b6fe 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h @@ -121,9 +121,9 @@ class 
SlicePluginDynamic : public DynamicPluginTensorRT { cudaStream_t copy_stream_; }; -class SlicePluginV2Creator : public nvinfer1::IPluginCreator { +class SlicePluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SlicePluginV2Creator() {} + SlicePluginDynamicCreator() {} const char* getPluginName() const override { return "slice_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -155,7 +155,7 @@ class SlicePluginV2Creator : public nvinfer1::IPluginCreator { nvinfer1::PluginFieldCollection field_collection_; }; -REGISTER_TRT_PLUGIN_V2(SlicePluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SlicePluginDynamicCreator); #endif diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index e43b57357fb64f..1ee895154d6b04 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -193,9 +193,9 @@ class SplitPluginDynamic : public DynamicPluginTensorRT { std::vector output_length_; }; -class SplitPluginV2Creator : public nvinfer1::IPluginCreator { +class SplitPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SplitPluginV2Creator() {} + SplitPluginDynamicCreator() {} const char* getPluginName() const override { return "split_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -231,7 +231,7 @@ class SplitPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(SplitPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SplitPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h index 85cc6916238fef..11579aadcc4573 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h @@ -127,9 +127,9 @@ class SwishPluginDynamic : public DynamicPluginTensorRT { float beta_; }; -class SwishPluginV2Creator : public nvinfer1::IPluginCreator { +class SwishPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SwishPluginV2Creator() {} + SwishPluginDynamicCreator() {} const char* getPluginName() const override { return "swish_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -165,7 +165,7 @@ class SwishPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(SwishPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SwishPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu new file mode 100644 index 00000000000000..13d07e774036a4 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -0,0 +1,401 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" +#include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h" +#include "paddle/fluid/operators/detection/yolo_box_op.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +YoloBoxPlugin::YoloBoxPlugin(const nvinfer1::DataType data_type, + const std::vector& anchors, + const int class_num, const float conf_thresh, + const int downsample_ratio, const bool clip_bbox, + const float scale_x_y, const int input_h, + const int input_w) + : data_type_(data_type), + class_num_(class_num), + conf_thresh_(conf_thresh), + downsample_ratio_(downsample_ratio), + clip_bbox_(clip_bbox), + scale_x_y_(scale_x_y), + input_h_(input_h), + input_w_(input_w) { + anchors_.insert(anchors_.end(), anchors.cbegin(), anchors.cend()); + assert(data_type_ == nvinfer1::DataType::kFLOAT || + data_type_ == nvinfer1::DataType::kHALF); + assert(class_num_ > 0); + assert(input_h_ > 0); + assert(input_w_ > 0); + + cudaMalloc(&anchors_device_, anchors.size() * sizeof(int)); + cudaMemcpy(anchors_device_, anchors.data(), anchors.size() * sizeof(int), + cudaMemcpyHostToDevice); +} + +YoloBoxPlugin::YoloBoxPlugin(const void* data, size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &anchors_); + DeserializeValue(&data, &length, &class_num_); + DeserializeValue(&data, &length, &conf_thresh_); + DeserializeValue(&data, &length, &downsample_ratio_); + DeserializeValue(&data, &length, &clip_bbox_); + DeserializeValue(&data, &length, &scale_x_y_); + DeserializeValue(&data, &length, &input_h_); + DeserializeValue(&data, &length, &input_w_); +} + +YoloBoxPlugin::~YoloBoxPlugin() { + if (anchors_device_ != nullptr) { + cudaFree(anchors_device_); + anchors_device_ = nullptr; + } +} + +const char* YoloBoxPlugin::getPluginType() const { return "yolo_box_plugin"; } + +const char* YoloBoxPlugin::getPluginVersion() const { return "1"; } + +int YoloBoxPlugin::getNbOutputs() const { return 2; } + +nvinfer1::Dims YoloBoxPlugin::getOutputDimensions(int index, + const nvinfer1::Dims* inputs, + int nb_input_dims) { + const int anchor_num = anchors_.size() / 2; + const int box_num = inputs[0].d[1] * inputs[0].d[2] * anchor_num; + + assert(index <= 1); + + if (index == 0) { + return nvinfer1::Dims2(box_num, 4); + } + return nvinfer1::Dims2(box_num, class_num_); +} + +bool YoloBoxPlugin::supportsFormat(nvinfer1::DataType type, + nvinfer1::TensorFormat format) const { + return ((type == data_type_ || type == nvinfer1::DataType::kINT32) && + format == nvinfer1::TensorFormat::kLINEAR); +} + +size_t YoloBoxPlugin::getWorkspaceSize(int max_batch_size) const { return 0; } + +template +__device__ inline T sigmoid(T x) { + return 1. / (1. 
+ exp(-x)); +} + +template <> +__device__ inline float sigmoid(float x) { + return 1.f / (1.f + expf(-x)); +} + +template +__device__ inline void GetYoloBox(float* box, const T* x, const int* anchors, + int i, int j, int an_idx, int grid_size_h, + int grid_size_w, int input_size_h, + int input_size_w, int index, int stride, + int img_height, int img_width, float scale, + float bias) { + box[0] = static_cast( + (i + sigmoid(static_cast(x[index]) * scale + bias)) * img_width / + grid_size_w); + box[1] = static_cast( + (j + sigmoid(static_cast(x[index + stride]) * scale + bias)) * + img_height / grid_size_h); + box[2] = static_cast(expf(static_cast(x[index + 2 * stride])) * + anchors[2 * an_idx] * img_width / input_size_w); + box[3] = + static_cast(expf(static_cast(x[index + 3 * stride])) * + anchors[2 * an_idx + 1] * img_height / input_size_h); +} + +__device__ inline int GetEntryIndex(int batch, int an_idx, int hw_idx, + int an_num, int an_stride, int stride, + int entry) { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; +} + +template +__device__ inline void CalcDetectionBox(T* boxes, const float* box, + const int box_idx, const int img_height, + const int img_width, bool clip_bbox) { + float tmp_box_0, tmp_box_1, tmp_box_2, tmp_box_3; + tmp_box_0 = box[0] - box[2] / 2; + tmp_box_1 = box[1] - box[3] / 2; + tmp_box_2 = box[0] + box[2] / 2; + tmp_box_3 = box[1] + box[3] / 2; + + if (clip_bbox) { + tmp_box_0 = max(tmp_box_0, 0.f); + tmp_box_1 = max(tmp_box_1, 0.f); + tmp_box_2 = min(tmp_box_2, static_cast(img_width - 1)); + tmp_box_3 = min(tmp_box_3, static_cast(img_height - 1)); + } + + boxes[box_idx + 0] = static_cast(tmp_box_0); + boxes[box_idx + 1] = static_cast(tmp_box_1); + boxes[box_idx + 2] = static_cast(tmp_box_2); + boxes[box_idx + 3] = static_cast(tmp_box_3); +} + +template +__device__ inline void CalcLabelScore(T* scores, const T* input, + const int label_idx, const int score_idx, + const int class_num, const float conf, + const int stride) { + for (int i = 0; i < class_num; i++) { + scores[score_idx + i] = static_cast( + conf * sigmoid(static_cast(input[label_idx + i * stride]))); + } +} + +template +__global__ void KeYoloBoxFw(const T* const input, const int* const imgsize, + T* boxes, T* scores, const float conf_thresh, + const int* anchors, const int n, const int h, + const int w, const int an_num, const int class_num, + const int box_num, int input_size_h, + int input_size_w, bool clip_bbox, const float scale, + const float bias) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + float box[4]; + for (; tid < n * box_num; tid += stride) { + int grid_num = h * w; + int i = tid / box_num; + int j = (tid % box_num) / grid_num; + int k = (tid % grid_num) / w; + int l = tid % w; + + int an_stride = (5 + class_num) * grid_num; + int img_height = imgsize[2 * i]; + int img_width = imgsize[2 * i + 1]; + + int obj_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4); + float conf = sigmoid(static_cast(input[obj_idx])); + int box_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); + + if (conf < conf_thresh) { + for (int i = 0; i < 4; ++i) { + box[i] = 0.f; + } + } else { + GetYoloBox(box, input, anchors, l, k, j, h, w, input_size_h, + input_size_w, box_idx, grid_num, img_height, img_width, + scale, bias); + } + + box_idx = (i * box_num + j * grid_num + k * w + l) * 4; + CalcDetectionBox(boxes, box, box_idx, img_height, img_width, clip_bbox); + + int label_idx = + GetEntryIndex(i, 
j, k * w + l, an_num, an_stride, grid_num, 5); + int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; + CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, + grid_num); + } +} + +template +int YoloBoxPlugin::enqueue_impl(int batch_size, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + const int n = batch_size; + const int h = input_h_; + const int w = input_w_; + const int an_num = anchors_.size() / 2; + const int box_num = h * w * an_num; + int input_size_h = downsample_ratio_ * h; + int input_size_w = downsample_ratio_ * w; + + float bias = -0.5 * (scale_x_y_ - 1.); + constexpr int threads = 256; + + KeYoloBoxFw<<<(n * box_num + threads - 1) / threads, threads, 0, stream>>>( + reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1]), + reinterpret_cast(outputs[0]), reinterpret_cast(outputs[1]), + conf_thresh_, anchors_device_, n, h, w, an_num, class_num_, box_num, + input_size_h, input_size_w, clip_bbox_, scale_x_y_, bias); + return cudaGetLastError() != cudaSuccess; +} + +int YoloBoxPlugin::enqueue(int batch_size, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + if (data_type_ == nvinfer1::DataType::kFLOAT) { + return enqueue_impl(batch_size, inputs, outputs, workspace, stream); + } else if (data_type_ == nvinfer1::DataType::kHALF) { + return enqueue_impl(batch_size, inputs, outputs, workspace, stream); + } + assert("unsupported type."); +} + +int YoloBoxPlugin::initialize() { return 0; } + +void YoloBoxPlugin::terminate() {} + +size_t YoloBoxPlugin::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += SerializedSize(data_type_); + serialize_size += SerializedSize(anchors_); + serialize_size += SerializedSize(class_num_); + serialize_size += SerializedSize(conf_thresh_); + serialize_size += SerializedSize(downsample_ratio_); + serialize_size += SerializedSize(clip_bbox_); + serialize_size += SerializedSize(scale_x_y_); + serialize_size += SerializedSize(input_h_); + serialize_size += SerializedSize(input_w_); + return serialize_size; +} + +void YoloBoxPlugin::serialize(void* buffer) const { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, anchors_); + SerializeValue(&buffer, class_num_); + SerializeValue(&buffer, conf_thresh_); + SerializeValue(&buffer, downsample_ratio_); + SerializeValue(&buffer, clip_bbox_); + SerializeValue(&buffer, scale_x_y_); + SerializeValue(&buffer, input_h_); + SerializeValue(&buffer, input_w_); +} + +void YoloBoxPlugin::destroy() {} + +void YoloBoxPlugin::setPluginNamespace(const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* YoloBoxPlugin::getPluginNamespace() const { + return namespace_.c_str(); +} + +nvinfer1::DataType YoloBoxPlugin::getOutputDataType( + int index, const nvinfer1::DataType* input_type, int nb_inputs) const { + return data_type_; +} + +bool YoloBoxPlugin::isOutputBroadcastAcrossBatch(int output_index, + const bool* input_is_broadcast, + int nb_inputs) const { + return false; +} + +bool YoloBoxPlugin::canBroadcastInputAcrossBatch(int input_index) const { + return false; +} + +void YoloBoxPlugin::configurePlugin( + const nvinfer1::Dims* input_dims, int nb_inputs, + const nvinfer1::Dims* output_dims, int nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, const bool* input_is_broadcast, + const bool* output_is_broadcast, nvinfer1::PluginFormat float_format, + int max_batct_size) {} + +nvinfer1::IPluginV2Ext* 
YoloBoxPlugin::clone() const { + return new YoloBoxPlugin(data_type_, anchors_, class_num_, conf_thresh_, + downsample_ratio_, clip_bbox_, scale_x_y_, input_h_, + input_w_); +} + +YoloBoxPluginCreator::YoloBoxPluginCreator() {} + +void YoloBoxPluginCreator::setPluginNamespace(const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* YoloBoxPluginCreator::getPluginNamespace() const { + return namespace_.c_str(); +} + +const char* YoloBoxPluginCreator::getPluginName() const { + return "yolo_box_plugin"; +} + +const char* YoloBoxPluginCreator::getPluginVersion() const { return "1"; } + +const nvinfer1::PluginFieldCollection* YoloBoxPluginCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + const nvinfer1::PluginField* fields = fc->fields; + + int type_id = -1; + std::vector anchors; + int class_num = -1; + float conf_thresh = 0.01; + int downsample_ratio = 32; + bool clip_bbox = true; + float scale_x_y = 1.; + int h = -1; + int w = -1; + + for (int i = 0; i < fc->nbFields; ++i) { + const std::string field_name(fc->fields[i].name); + if (field_name.compare("type_id") == 0) { + type_id = *static_cast(fc->fields[i].data); + } else if (field_name.compare("anchors")) { + const int length = fc->fields[i].length; + const int* data = static_cast(fc->fields[i].data); + anchors.insert(anchors.end(), data, data + length); + } else if (field_name.compare("class_num")) { + class_num = *static_cast(fc->fields[i].data); + } else if (field_name.compare("conf_thresh")) { + conf_thresh = *static_cast(fc->fields[i].data); + } else if (field_name.compare("downsample_ratio")) { + downsample_ratio = *static_cast(fc->fields[i].data); + } else if (field_name.compare("clip_bbox")) { + clip_bbox = *static_cast(fc->fields[i].data); + } else if (field_name.compare("scale_x_y")) { + scale_x_y = *static_cast(fc->fields[i].data); + } else if (field_name.compare("h")) { + h = *static_cast(fc->fields[i].data); + } else if (field_name.compare("w")) { + w = *static_cast(fc->fields[i].data); + } else { + assert(false && "unknown plugin field name."); + } + } + + return new YoloBoxPlugin( + type_id ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, anchors, + class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, h, w); +} + +nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new YoloBoxPlugin(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h new file mode 100644 index 00000000000000..8ca21da7ae0377 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h @@ -0,0 +1,117 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class YoloBoxPlugin : public nvinfer1::IPluginV2Ext { + public: + explicit YoloBoxPlugin(const nvinfer1::DataType data_type, + const std::vector& anchors, const int class_num, + const float conf_thresh, const int downsample_ratio, + const bool clip_bbox, const float scale_x_y, + const int input_h, const int input_w); + YoloBoxPlugin(const void* data, size_t length); + ~YoloBoxPlugin() override; + + const char* getPluginType() const override; + const char* getPluginVersion() const override; + int getNbOutputs() const override; + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nb_input_dims) override; + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::TensorFormat format) const override; + size_t getWorkspaceSize(int max_batch_size) const override; + int enqueue(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) override; + template + int enqueue_impl(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream); + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* input_type, + int nb_inputs) const override; + bool isOutputBroadcastAcrossBatch(int output_index, + const bool* input_is_broadcast, + int nb_inputs) const override; + bool canBroadcastInputAcrossBatch(int input_index) const override; + void configurePlugin(const nvinfer1::Dims* input_dims, int nb_inputs, + const nvinfer1::Dims* output_dims, int nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, + const bool* input_is_broadcast, + const bool* output_is_broadcast, + nvinfer1::PluginFormat float_format, + int max_batct_size) override; + nvinfer1::IPluginV2Ext* clone() const override; + + private: + nvinfer1::DataType data_type_; + std::vector anchors_; + int* anchors_device_; + int class_num_; + float conf_thresh_; + int downsample_ratio_; + bool clip_bbox_; + float scale_x_y_; + int input_h_; + int input_w_; + std::string namespace_; +}; + +class YoloBoxPluginCreator : public nvinfer1::IPluginCreator { + public: + YoloBoxPluginCreator(); + ~YoloBoxPluginCreator() override = default; + + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + 
nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; + +REGISTER_TRT_PLUGIN_V2(YoloBoxPluginCreator); + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc index 6d4bb70df6f3ad..9211ea246a5c5e 100644 --- a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc +++ b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc @@ -75,14 +75,15 @@ int test_predictor_zero_copy(const AnalysisConfig& config_in, } std::vector input({1}); - auto in_tensor{predictor->GetInputTensor(predictor->GetInputNames().front())}; + auto in_tensor = + predictor->GetInputTensor(predictor->GetInputNames().front()); in_tensor->Reshape({1, 1}); in_tensor->copy_from_cpu(input.data()); predictor->ZeroCopyRun(); - auto out_tensor{ - predictor->GetOutputTensor(predictor->GetOutputNames().front())}; + auto out_tensor = + predictor->GetOutputTensor(predictor->GetOutputNames().front()); std::vector data_o(10); out_tensor->copy_to_cpu(data_o.data()); diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 377ea376773899..2ea047fa13c105 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -27,12 +27,18 @@ if (WITH_ROCM) cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator) endif() +if (WITH_ASCEND_CL) + cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info) +endif() + cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator) if (WITH_GPU OR WITH_ROCM) set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator) elseif(WITH_XPU) set(AllocatorFacadeDeps xpu_info) +elseif(WITH_ASCEND) + set(AllocatorFacadeDeps ascend_npu_info) else () set(AllocatorFacadeDeps) endif() diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index cbeb263b5f41b9..730efa5c646885 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -32,6 +32,7 @@ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/xpu_info.h" #endif +#include "paddle/fluid/platform/npu_info.h" DEFINE_int64( gpu_allocator_retry_time, 10000, @@ -66,6 +67,11 @@ class AllocatorFacadePrivate { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } InitNaiveBestFitCUDAPinnedAllocator(); +#endif +#ifdef PADDLE_WITH_ASCEND_CL + for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { + InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); + } #endif break; } @@ -185,6 +191,12 @@ class AllocatorFacadePrivate { } #endif +#ifdef PADDLE_WITH_ASCEND_CL + void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) { + allocators_[p] = std::make_shared(p); + } +#endif + class ZeroSizeAllocator : public Allocator { public: explicit ZeroSizeAllocator(platform::Place place) : place_(place) {} diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 0ada2cafcc16a6..3e88d61783c9e6 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ 
b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -19,7 +19,10 @@ #include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" +#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/npu_info.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" @@ -110,6 +113,7 @@ size_t Used(const platform::CPUPlace &place) { return GetCPUBuddyAllocator()->Used(); } +// For kunlun XPU template <> void *Alloc(const platform::XPUPlace &place, size_t size) { #ifdef PADDLE_WITH_XPU @@ -219,6 +223,135 @@ size_t Used(const platform::XPUPlace &place) { #endif } +// For Ascend NPU +#ifdef PADDLE_WITH_ASCEND_CL +class NPUBuddyAllocatorList { + private: + NPUBuddyAllocatorList() : devices_(platform::GetSelectedNPUDevices()) { + auto npu_num = devices_.size(); + allocators_.resize(npu_num); + init_flags_.reserve(npu_num); + for (size_t i = 0; i < npu_num; ++i) { + init_flags_.emplace_back(new std::once_flag()); + } + } + + static NPUBuddyAllocatorList *CreateNewInstance() { + return new NPUBuddyAllocatorList(); + } + + public: + static NPUBuddyAllocatorList *Instance() { + static auto *instance = CreateNewInstance(); + return instance; + } + + BuddyAllocator *Get(int npu_id) { + auto pos = std::distance( + devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id)); + PADDLE_ENFORCE_LT(pos, devices_.size(), + platform::errors::OutOfRange( + "The index exceeds the size of devices, the size of " + "devices is %d, the index is %d", + devices_.size(), pos)); + + std::call_once(*init_flags_[pos], [this, pos] { + platform::SetNPUDeviceId(devices_[pos]); + allocators_[pos].reset(new BuddyAllocator( + std::unique_ptr( + new detail::NPUAllocator(devices_[pos])), + platform::NPUMinChunkSize(), platform::NPUMaxChunkSize())); + VLOG(10) << "\n\nNOTE:\n" + << "You can set GFlags environment variable " + << "'FLAGS_fraction_of_gpu_memory_to_use' " + << "or 'FLAGS_initial_gpu_memory_in_mb' " + << "or 'FLAGS_reallocate_gpu_memory_in_mb' " + << "to change the memory size for GPU usage.\n" + << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is " + << FLAGS_fraction_of_gpu_memory_to_use + << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is " + << FLAGS_initial_gpu_memory_in_mb + << ". 
Current 'FLAGS_reallocate_gpu_memory_in_mb' value is " + << FLAGS_reallocate_gpu_memory_in_mb << "\n\n"; + }); + + return allocators_[pos].get(); + } + + private: + std::vector devices_; + std::vector> init_flags_; + std::vector> allocators_; +}; + +BuddyAllocator *GetNPUBuddyAllocator(int npu_id) { + return NPUBuddyAllocatorList::Instance()->Get(npu_id); +} +#endif + +template <> +size_t Used(const platform::NPUPlace &place) { +#ifdef PADDLE_WITH_ASCEND_CL + return GetNPUBuddyAllocator(place.device)->Used(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void *Alloc(const platform::NPUPlace &place, size_t size) { +#ifdef PADDLE_WITH_ASCEND_CL + auto *buddy_allocator = GetNPUBuddyAllocator(place.device); + auto *ptr = buddy_allocator->Alloc(size); + if (ptr == nullptr) { + platform::NPUDeviceGuard(place.device); + size_t avail, total; + platform::NPUMemoryUsage(&avail, &total); + PADDLE_THROW(platform::errors::ResourceExhausted( + "Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize " + "%s, GpuMaxChunkSize %s, GPU memory used: %s.", + string::HumanReadableSize(size), place.device, + string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), + string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), + string::HumanReadableSize(Used(place)))); + } else { + if (FLAGS_init_allocated_mem) { + aclrtMemset(ptr, size, 0xEF, size); + } + } + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + return ptr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void Free(const platform::NPUPlace &place, void *p, + size_t size) { +#ifdef PADDLE_WITH_ASCEND_CL + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + GetNPUBuddyAllocator(place.device)->Free(p); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +uint64_t Release(const platform::NPUPlace &place) { +#ifdef PADDLE_WITH_ASCEND_CL + return GetNPUBuddyAllocator(place.device)->Release(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPlace' is not supported in CPU only device.")); +#endif +} + +// For CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class GPUBuddyAllocatorList { private: diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc index 37da748ee9c965..1fe85dd699acf1 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc @@ -61,6 +61,22 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) { } #endif +#ifdef PADDLE_WITH_ASCEND_CL +TEST(NaiveBestFitAllocatorTest, NpuAlloc) { + NaiveBestFitAllocator alloc{platform::NPUPlace(0)}; + { + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + } + sleep(10); + alloc.Release(platform::NPUPlace(0)); + + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + alloc.Release(platform::NPUPlace(0)); +} +#endif + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/npu_allocator.cc b/paddle/fluid/memory/allocation/npu_allocator.cc new file mode 100644 index 00000000000000..faf7ae6221caaf --- /dev/null +++ 
b/paddle/fluid/memory/allocation/npu_allocator.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/npu_allocator.h" +#include +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/npu_info.h" + +namespace paddle { +namespace memory { +namespace allocation { + +bool NPUAllocator::IsAllocThreadSafe() const { return true; } +void NPUAllocator::FreeImpl(Allocation* allocation) { + PADDLE_ENFORCE_EQ( + BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_, + platform::errors::PermissionDenied( + "NPU memory is freed in incorrect device. This may be a bug")); + platform::RecordedNPUFree(allocation->ptr(), allocation->size(), + place_.device); + delete allocation; +} + +Allocation* NPUAllocator::AllocateImpl(size_t size) { + std::call_once(once_flag_, + [this] { platform::SetNPUDeviceId(place_.device); }); + + void* ptr; + auto result = platform::RecordedNPUMalloc(&ptr, size, place_.device); + if (LIKELY(result == ACL_ERROR_NONE)) { + return new Allocation(ptr, size, platform::Place(place_)); + } + + size_t avail, total, actual_avail, actual_total; + bool is_limited = platform::RecordedNPUMemGetInfo( + &avail, &total, &actual_avail, &actual_total, place_.device); + + std::string err_msg; + if (is_limited) { + auto limit_size = (total >> 20); + err_msg = string::Sprintf( + "Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger " + "value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum " + "GPU memory usage is limited to %d MB.\n" + " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", + limit_size, limit_size); + } + + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on NPU %d. " + "Cannot allocate %s memory on NPU %d, " + "available memory is only %s.\n\n" + "Please check whether there is any other process using NPU %d.\n" + "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n" + "2. If no, please decrease the batch size of your model. %s\n\n", + place_.device, string::HumanReadableSize(size), place_.device, + string::HumanReadableSize(avail), place_.device, err_msg)); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/npu_allocator.h b/paddle/fluid/memory/allocation/npu_allocator.h new file mode 100644 index 00000000000000..bf668973505bab --- /dev/null +++ b/paddle/fluid/memory/allocation/npu_allocator.h @@ -0,0 +1,41 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include // NOLINT +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class NPUAllocator : public Allocator { + public: + explicit NPUAllocator(const platform::NPUPlace& place) : place_(place) {} + + bool IsAllocThreadSafe() const override; + + protected: + void FreeImpl(Allocation* allocation) override; + Allocation* AllocateImpl(size_t size) override; + + private: + platform::NPUPlace place_; + std::once_flag once_flag_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt index fcae741db3667f..e9631ee739b9b8 100644 --- a/paddle/fluid/memory/detail/CMakeLists.txt +++ b/paddle/fluid/memory/detail/CMakeLists.txt @@ -6,6 +6,8 @@ if(WITH_GPU) nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place) elseif(WITH_ROCM) hip_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place) +elseif(${WITH_ASCEND_CL}) + cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info npu_info place) else() cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place) endif() diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 50c0b58f3a1dd6..55436f451a41ff 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -21,6 +21,9 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif +#ifdef PADDLE_WITH_ASCEND_CL +DECLARE_uint64(reallocate_gpu_memory_in_mb); +#endif namespace paddle { namespace memory { @@ -235,6 +238,21 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( } } #endif +#ifdef PADDLE_WITH_ASCEND_CL + if (system_allocator_->UseGpu()) { + if ((total_used_ + total_free_) == 0) { + // Compute the allocation size for gpu for the first allocation. + allocate_bytes = std::max(platform::NPUInitAllocSize(), request_bytes); + } else { + // Compute the re-allocation size, we store the re-allocation size when + // user set FLAGS_reallocate_gpu_memory_in_mb to fix value. + if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) { + realloc_size_ = platform::NPUReallocSize(); + } + allocate_bytes = std::max(realloc_size_, request_bytes); + } + } +#endif // Allocate a new block void* p = system_allocator_->Alloc(&index, allocate_bytes); diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 15e93deffccda8..135c3b6d04f346 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -26,6 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/npu_info.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc index 2dc3e73af24162..290f3d5d1bcd47 100644 --- a/paddle/fluid/memory/detail/buddy_allocator_test.cc +++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc @@ -19,14 +19,16 @@ limitations under the License. */ #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" #endif +#include +#include + #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/npu_info.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include -#include - +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_ASCEND_CL) DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_uint64(initial_gpu_memory_in_mb); DECLARE_uint64(reallocate_gpu_memory_in_mb); @@ -342,6 +344,32 @@ TEST(BuddyAllocator, Release) { } #endif +#ifdef PADDLE_WITH_ASCEND_CL +TEST(BuddyAllocator, NpuFraction) { + // In a 16 GB machine, the pool size will be about 160 MB + FLAGS_fraction_of_gpu_memory_to_use = 0.005; + FLAGS_fraction_of_gpu_memory_to_use = 0.92; + FLAGS_initial_gpu_memory_in_mb = 0; + FLAGS_reallocate_gpu_memory_in_mb = 0; + + BuddyAllocator buddy_allocator( + std::unique_ptr(new NPUAllocator(0)), + platform::NPUMinChunkSize(), platform::NPUMaxChunkSize()); + + // Less than pool size + TestBuddyAllocator(&buddy_allocator, 10); + TestBuddyAllocator(&buddy_allocator, 10 << 10); + TestBuddyAllocator(&buddy_allocator, 10 << 20); + buddy_allocator.Release(); + + // Greater than max chunk size + TestBuddyAllocator(&buddy_allocator, 300 << 20, + /* use_system_allocator = */ true); + TestBuddyAllocator(&buddy_allocator, 1 * static_cast(1 << 30), + /* use_system_allocator = */ true); +} +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 38baf6c24bab3f..c733ba5c68c9bd 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -29,6 +29,8 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/npu_info.h" + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -247,6 +249,68 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; } #endif +#ifdef PADDLE_WITH_ASCEND_CL +void* NPUAllocator::Alloc(size_t* index, size_t size) { + if (size <= 0) return nullptr; + + void* p; + auto result = platform::RecordedNPUMalloc(&p, size, npu_id_); + + if (result == ACL_ERROR_NONE) { + *index = 0; + npu_alloc_size_ += size; + return p; + } else { + size_t avail, total, actual_avail, actual_total; + bool is_limited = platform::RecordedNPUMemGetInfo( + &avail, &total, &actual_avail, &actual_total, npu_id_); + + std::string err_msg; + if (is_limited) { + auto limit_size = (total >> 20); + err_msg = string::Sprintf( + "\n 3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a " + "larger value. 
Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " + "maximum GPU memory usage is limited to %d MB.\n" + " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", + limit_size, limit_size); + } + + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on NPU %d. " + "Cannot allocate %s memory on NPU %d, " + "available memory is only %s.\n\n" + "Please check whether there is any other process using NPU %d.\n" + "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n" + "2. If no, please try one of the following suggestions:\n" + " 1) Decrease the batch size of your model.\n" + " 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, " + "please set it to a higher value but less than 1.0.\n" + " The command is " + "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", + npu_id_, string::HumanReadableSize(size), npu_id_, + string::HumanReadableSize(avail), npu_id_, + FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + } +} + +void NPUAllocator::Free(void* p, size_t size, size_t index) { + VLOG(4) << "Free " << p << " size " << size; + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_GE(npu_alloc_size_, size, + platform::errors::InvalidArgument( + "The size of memory (%d) to free exceeds the size of " + "allocated gpu memory (%d)", + size, npu_alloc_size_)); + npu_alloc_size_ -= size; + + platform::RecordedNPUFree(p, size, npu_id_); +} + +bool NPUAllocator::UseGpu() const { return true; } +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h index e332bb670da235..26711ae4070f5e 100644 --- a/paddle/fluid/memory/detail/system_allocator.h +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -66,6 +66,22 @@ class CUDAPinnedAllocator : public SystemAllocator { }; #endif +#ifdef PADDLE_WITH_ASCEND_CL + +class NPUAllocator : public SystemAllocator { + public: + explicit NPUAllocator(int npu_id) : npu_id_(npu_id) {} + + virtual void* Alloc(size_t* index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + + private: + size_t npu_alloc_size_ = 0; + int npu_id_; +}; +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc index 13854d771a0bf6..ead188341dac46 100644 --- a/paddle/fluid/memory/detail/system_allocator_test.cc +++ b/paddle/fluid/memory/detail/system_allocator_test.cc @@ -85,3 +85,11 @@ TEST(GPUAllocator, AllocFailure) { } } #endif + +#ifdef PADDLE_WITH_ASCEND_CL +TEST(NPUAllocator, Alloc) { + paddle::memory::detail::NPUAllocator a(0); + TestAllocator(&a, 1 << 20); + TestAllocator(&a, 1); +} +#endif diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 7f871fab5a1470..1eb0535831bb19 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -40,7 +40,7 @@ void Copy(platform::XPUPlace dst_place, platform::CPUPlace src_place, const void* src, size_t num) { if (num <= 0) { - VLOG(0) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")"; + VLOG(1) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")"; return; } int dev_id = -1; @@ -86,7 +86,7 @@ void Copy(platform::CPUPlace dst_place, platform::XPUPlace src_place, const void* src, size_t num) { if (num <= 0) { - VLOG(0) << "memcpy 
XPU_DEVICE_TO_HOST size <= 0 (" << num << ")"; + VLOG(1) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")"; return; } int dev_id = -1; @@ -132,7 +132,7 @@ void Copy(platform::XPUPlace dst_place, platform::XPUPlace src_place, const void* src, size_t num) { if (num <= 0) { - VLOG(0) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")"; + VLOG(1) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")"; return; } int dev_id = -1; @@ -196,6 +196,101 @@ void Copy(platform::XPUPlace dst_place, } #endif +#ifdef PADDLE_WITH_ASCEND_CL +template <> +void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, size_t num, + aclrtStream stream) { + if (UNLIKELY(num == 0)) return; + + platform::SetNPUDeviceId(dst_place.device); + + // NOTE(ascendrc): NPU memcpy async from host to device is a "real" async, + // which is different from CUDA. In Paddle, when async is called, "sync" + // is actually run, which means Paddle doesn't fully support async yet. + // TODO(ascendrc): Support NPU memcpy async for better performance. + stream = nullptr; + + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by stream(" << stream << ")"; + + if (stream) { + platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU"); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); + } else { + platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU"); + platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); + } +} + +template <> +void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place, + void* dst, + platform::NPUPlace src_place, + const void* src, size_t num, + aclrtStream stream) { + if (UNLIKELY(num == 0)) return; + + platform::SetNPUDeviceId(src_place.device); + + // NOTE(ascendrc): NPU memcpy async from device to host is a "real" async, + // which is different from CUDA. In Paddle, when async is called, "sync" + // is actually run, which means Paddle doesn't fully support async yet. + // TODO(ascendrc): Support NPU memcpy async for better performance. + stream = nullptr; + + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by stream(" << stream << ")"; + + if (stream) { + platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU"); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); + } else { + platform::RecordEvent record_event("NpuMemcpySync:NPU->CPU"); + platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); + } +} + +template <> +void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place, + void* dst, + platform::NPUPlace src_place, + const void* src, size_t num, + aclrtStream stream) { + if (UNLIKELY(num == 0)) return; + + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by stream(" << stream << ")"; + if (dst_place == src_place) { + platform::SetNPUDeviceId(src_place.device); + if (stream) { + platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU"); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, + stream); + } else { + platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU"); + platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); + } + } else { + if (!platform::NPUCanAccessPeer(dst_place.device, src_place.device)) { + PADDLE_THROW(platform::errors::Unavailable( + "Peer access between NPU places is not allowed.")); + } + if (stream) { + // TODO(zhiqiu): support peer access? 
+ platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU"); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, + stream); + } else { + platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU"); + platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); + } + } +} +#endif + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h index 25490f28b65987..c630437224cd09 100644 --- a/paddle/fluid/memory/memcpy.h +++ b/paddle/fluid/memory/memcpy.h @@ -52,7 +52,27 @@ void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); template void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, gpuStream_t stream); +#endif +#ifdef PADDLE_WITH_ASCEND_CL +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU or NPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU or NPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * \param[in] stream NPU stream. + * + * \note For NPU memory copy, NPU stream need to be specified + * for asynchronously memory copy. + * + */ +template +void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, + aclrtStream stream); #endif + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 467a5ff9063a65..cecc70cc6dda8e 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -10,6 +10,7 @@ file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operators/CMakeLists copy_if_different(${pybind_file} ${pybind_file_final}) add_subdirectory(math) +add_subdirectory(eigen) add_subdirectory(controlflow) add_subdirectory(detection) add_subdirectory(elementwise) @@ -68,7 +69,7 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() -register_operators(EXCLUDES py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op +register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS}) @@ -110,8 +111,9 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_fun set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost ps_gpu_wrapper) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_cc_function) if (WITH_GPU OR WITH_ROCM) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor eigen_cu_function) endif() set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer) @@ -121,6 +123,12 @@ if (WITH_ASCEND) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} ascend_wrapper) endif() +if (WITH_ASCEND_CL) + cc_test(assign_op_npu_test SRCS assign_op_npu_test.cc DEPS assign_op) + cc_library(npu_op_runner SRCS npu_op_runner.cc DEPS operator npu_info) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} npu_op_runner) 
+endif() + # FIXME(typhoonzero): operator deps may not needed. # op_library(lod_tensor_to_array_op DEPS lod_rank_table_op) # op_library(array_to_lod_tensor_op DEPS lod_rank_table_op) @@ -134,8 +142,8 @@ set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS}) set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies") cc_test(test_common_infer_shape_functions SRCS test_common_infer_shape_functions.cc DEPS common_infer_shape_functions ${COMMON_OP_DEPS} activation_op elementwise_add_op softmax_op softmax) -cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op) cc_test(gather_test SRCS gather_test.cc DEPS tensor) +cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) @@ -154,12 +162,22 @@ endif() cc_library(tensor_formatter SRCS tensor_formatter.cc DEPS ${OP_HEADER_DEPS}) if (WITH_PYTHON) cc_library(py_func_op SRCS py_func_op.cc DEPS op_registry python pybind) + cc_library(py_layer_op SRCS py_layer_op.cc DEPS op_registry python pybind) +endif() + +if (WITH_ASCEND_CL) + cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor) + cc_test(lookup_table_v2_op_npu_test SRCS lookup_table_v2_op_npu_test.cc DEPS op_registry lookup_table_v2_op scope device_context enforce executor compare_op) endif() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") add_subdirectory(benchmark) cc_test(op_debug_string_test SRCS op_debug_string_test.cc DEPS elementwise_add_op) +if (WITH_ASCEND_CL) + cc_test(transpose_op_npu_test SRCS transpose_op_npu_test.cc DEPS op_registry transpose_op scope device_context enforce executor) +endif() + if(WITH_MKLDNN) include(mkldnn/inplace_op_tests.cmake) @@ -173,3 +191,7 @@ if(WITH_UNITY_BUILD) # The specified link dependency needs to be displayed here. 
target_link_libraries(paddle_operators_unity ${OP_HEADER_DEPS} ${COMMON_OP_DEPS}) endif() + +if(WITH_ASCEND_CL) +cc_test(gelu_op_npu_test SRCS gelu_op_npu_test.cc DEPS op_registry gelu_op scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 94f2eb3672bd5d..1cac9ed9f1dd08 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -782,6 +782,26 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { } }; +template +class TanhDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { + public: + using ::paddle::framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("tanh_grad_grad"); + // input1: Out + op->SetInput("Out", this->Input("Out")); + // input2: ddx + op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); + op->SetInput("DOut", this->Input(framework::GradVarName("Out"))); + op->SetAttrMap(this->Attrs()); + // output: ddy + op->SetOutput("DOutNew", this->InputGrad("Out")); + op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out"))); + } +}; + // ReluGrad: dx = dy if y >= 0 else 0 // ReluGradGrad: ddy = ddx if y >= 0 else 0 template @@ -1041,6 +1061,34 @@ namespace plat = paddle::platform; FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); +/* ========================== tanh register ============================= */ +REGISTER_OPERATOR( + tanh, ops::ActivationOp, ops::TanhOpMaker, ops::ActivationOpInferVarType, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::framework::OpDesc>, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::imperative::OpBase>, + std::conditional>(), + ops::ActFwdInplaceInferer, void>::type); +REGISTER_OPERATOR(tanh_grad, ops::ActivationOpGrad, + ops::ActivationGradOpInplaceInferer, + ops::TanhDoubleGradMaker, + ops::TanhDoubleGradMaker) +REGISTER_OPERATOR( + tanh_grad_grad, + ops::ActivationOpDoubleGrad::FwdDeps()>, + ops::ActivationDoubleGradOpInplaceInferer); + +REGISTER_ACTIVATION_CPU_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor); +REGISTER_OP_CPU_KERNEL( + tanh_grad_grad, ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>); +/* ========================================================================== */ + /* ========================== relu register ============================= */ REGISTER_OPERATOR( relu, ops::ActivationOp, ops::ReluOpMaker, ops::ActivationOpInferVarType, diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index c6d2fbccd8e84b..781a97c1ffcc17 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -42,6 +42,10 @@ template class BaseGPUFunctor { public: using ELEMENT_TYPE = T; + + using AttrPair = std::vector>; + + AttrPair GetAttrs() { return AttrPair(); } }; /* ========================================================================== */ @@ -57,42 +61,35 @@ class ReluGPUFunctor : public BaseGPUFunctor { // for relu forward when T is double __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type* x); + const typename CudaVecType::type in) { + // relu forward : out = max(x, 0) + return in > zero_ ? in : zero_; + } // when num % vecsize != 0 this func will be used - __device__ __forceinline__ T ComputeRemainder(const T x) { - return x > zero_ ? 
x : zero_; + __device__ __forceinline__ T ComputeRemainder(const T in) { + // relu forward : out = max(x, 0) + return in > zero_ ? in : zero_; } }; -template <> -__device__ __forceinline__ CudaVecType::type -ReluGPUFunctor::Compute(const CudaVecType::type* x) { -// relu forward : out = max(x, 0) -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 - return __ldg(x) > zero_ ? __ldg(x) : zero_; -#else - return (*x) > zero_ ? (*x) : zero_; -#endif -} - template <> __device__ __forceinline__ CudaVecType::type -ReluGPUFunctor::Compute(const CudaVecType::type* xx) { - // relu forward : out = max(xx, 0) - return make_float4((xx->x > zero_) * (xx->x), (xx->y > zero_) * (xx->y), - (xx->z > zero_) * (xx->z), (xx->w > zero_) * (xx->w)); +ReluGPUFunctor::Compute(const CudaVecType::type in) { + // relu forward : out = max(in, 0) + return make_float4((in.x > zero_) * (in.x), (in.y > zero_) * (in.y), + (in.z > zero_) * (in.z), (in.w > zero_) * (in.w)); } template <> __device__ __forceinline__ CudaVecType::type -ReluGPUFunctor::Compute(const CudaVecType::type* in) { +ReluGPUFunctor::Compute(const CudaVecType::type in) { // relu forward : out = max(in, 0) #ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) const half2 kzero = __float2half2_rn(0.0f); - return __hmul2(__hgt2(__ldg(in), kzero), __ldg(in)); + return __hmul2(__hgt2(in, kzero), in); #else - const float2 xx = __half22float2(*in); + const float2 xx = __half22float2(in); return __floats2half2_rn((xx.x > 0.0f) * static_cast(xx.x), (xx.y > 0.0f) * static_cast(xx.y)); #endif @@ -112,8 +109,10 @@ class ReluGradGPUFunctor : public BaseGPUFunctor { // for relu backward when T is double __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type* out, - const typename CudaVecType::type* dout); + const typename CudaVecType::type out, + const typename CudaVecType::type dout) { + return out > zero_ ? dout : zero_; + } // when num % vecsize != 0 this func will be used __device__ __forceinline__ T ComputeRemainder(const T out, const T dout) { @@ -124,44 +123,132 @@ class ReluGradGPUFunctor : public BaseGPUFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; -template <> -__device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type* out, - const CudaVecType::type* dout) { -// relu backward : dx = out > 0 ? dout : 0; -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 - return __ldg(out) > zero_ ? __ldg(dout) : zero_; -#else - return (*out) > zero_ ? (*dout) : zero_; -#endif -} - template <> __device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type* out, - const CudaVecType::type* dout) { +ReluGradGPUFunctor::Compute(const CudaVecType::type out, + const CudaVecType::type dout) { // relu backward : dx = out > 0 ? dout : 0; - return make_float4((out->x > zero_) * (dout->x), (out->y > zero_) * (dout->y), - (out->z > zero_) * (dout->z), - (out->w > zero_) * (dout->w)); + return make_float4((out.x > zero_) * (dout.x), (out.y > zero_) * (dout.y), + (out.z > zero_) * (dout.z), (out.w > zero_) * (dout.w)); } template <> __device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type* out, - const CudaVecType::type* dout) { +ReluGradGPUFunctor::Compute(const CudaVecType::type out, + const CudaVecType::type dout) { // relu backward : dx = out > 0 ? 
dout : 0; #ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) const half2 kzero = __float2half2_rn(0.0f); - return __hmul2(__hgt2(__ldg(out), kzero), __ldg(dout)); + return __hmul2(__hgt2(out, kzero), dout); #else - const float2 xx = __half22float2(*out); - const float2 yy = __half22float2(*dout); + const float2 xx = __half22float2(out); + const float2 yy = __half22float2(dout); return __floats2half2_rn((xx.x > 0.0f) * static_cast(yy.x), (xx.y > 0.0f) * static_cast(yy.y)); #endif } +/* ========================================================================== */ +/* ======================== leaky relu forward ======================== + */ +template +class LeakyReluGPUFunctor : public BaseGPUFunctor { + private: + T zero_; + float alpha_; + + public: + LeakyReluGPUFunctor() { zero_ = static_cast(0.0f); } + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha_}}; + } + // leakyrelu forward : out = x > 0 ? x : x * alpha + __device__ __forceinline__ typename CudaVecType::type Compute( + const typename CudaVecType::type in) { + return in > zero_ ? in : static_cast(alpha_) * in; + } + + __device__ __forceinline__ T ComputeRemainder(const T in) { + // leakyrelu forward : out = x > 0 ? x : x * alpha + return in > zero_ ? in : static_cast(alpha_) * in; + } +}; + +template <> +__device__ __forceinline__ CudaVecType::type +LeakyReluGPUFunctor::Compute(const CudaVecType::type in) { + // leakyrelu forward : out = x > 0 ? x : x * alpha + return make_float4((in.x > zero_) ? (in.x) : (in.x) * alpha_, + (in.y > zero_) ? (in.y) : (in.y) * alpha_, + (in.z > zero_) ? (in.z) : (in.z) * alpha_, + (in.w > zero_) ? (in.w) : (in.w) * alpha_); +} + +template <> +__device__ __forceinline__ CudaVecType::type +LeakyReluGPUFunctor::Compute(const CudaVecType::type in) { + // leakyrelu forward : out = x > 0 ? x : x * alpha + const float2 xx = __half22float2(in); + return __floats2half2_rn((xx.x > 0.0f) ? xx.x : xx.x * alpha_, + (xx.y > 0.0f) ? xx.y : xx.y * alpha_); +} +/* ========================================================================== */ + +/* =========================== leaky relu backward ======================= + */ +template +class LeakyReluGradGPUFunctor : public BaseGPUFunctor { + private: + T zero_; + float alpha_; + + public: + LeakyReluGradGPUFunctor() { zero_ = static_cast(0.0f); } + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha_}}; + } + + // for leaky relu backward when T is double + __device__ __forceinline__ typename CudaVecType::type Compute( + const typename CudaVecType::type in, + const typename CudaVecType::type dout) { + // leakyrelu backward : dx = x > 0 ? dout : alpha * dout + return in > zero_ ? dout : static_cast(alpha_) * dout; + } + + // when num % vecsize != 0 this func will be used + __device__ __forceinline__ T ComputeRemainder(const T in, const T dout) { + // leakyrelu backward : dx = x > 0 ? dout : alpha * dout + return in > zero_ ? dout : static_cast(alpha_) * dout; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template <> +__device__ __forceinline__ CudaVecType::type +LeakyReluGradGPUFunctor::Compute(const CudaVecType::type in, + const CudaVecType::type dout) { + // leakyrelu backward : dx = x > 0 ? dout : alpha * dout + return make_float4((in.x > zero_) ? (dout.x) : alpha_ * (dout.x), + (in.y > zero_) ? (dout.y) : alpha_ * (dout.y), + (in.z > zero_) ? (dout.z) : alpha_ * (dout.z), + (in.w > zero_) ? 
(dout.w) : alpha_ * (dout.w)); +} + +template <> +__device__ __forceinline__ CudaVecType::type LeakyReluGradGPUFunctor< + float16>::Compute(const CudaVecType::type in, + const CudaVecType::type dout) { + // leakyrelu backward : dx = x > 0 ? dout : alpha * dout + const float2 xx = __half22float2(in); + const float2 yy = __half22float2(dout); + return __floats2half2_rn((xx.x > 0.0f) ? yy.x : alpha_ * yy.x, + (xx.y > 0.0f) ? yy.y : alpha_ * yy.y); +} + /* ========================================================================== */ template @@ -176,14 +263,23 @@ __global__ void ActivationGradKernelVec(const T* forward_data, const T* dout, const VecType* in_forward = reinterpret_cast(forward_data); const VecType* in_dout = reinterpret_cast(dout); VecType* out = reinterpret_cast(dx); - + VecType forward_vec, dout_vec; + T in_data, dout_data; for (int i = idx; i < loop; i += stride) { - out[i] = functor.Compute((in_forward + i), (in_dout + i)); +#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 + forward_vec = __ldg(in_forward + i); + dout_vec = __ldg(in_dout + i); +#else + forward_vec = in_forward[i]; + dout_vec = in_dout[i]; +#endif + out[i] = functor.Compute(forward_vec, dout_vec); } while (idx == loop && tail) { - dx[num - tail] = - functor.ComputeRemainder(forward_data[num - tail], dout[num - tail]); + in_data = forward_data[num - tail]; + dout_data = dout[num - tail]; + dx[num - tail] = functor.ComputeRemainder(in_data, dout_data); --tail; } } @@ -199,9 +295,14 @@ __global__ void ActivationkernelVec(const T* src, T* dst, int num, int tail = num % vecsize; const VecType* in = reinterpret_cast(src); VecType* out = reinterpret_cast(dst); - + VecType x_vec; for (int i = idx; i < loop; i += stride) { - out[i] = functor.Compute((in + i)); +#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 + x_vec = __ldg(in + i); +#else + x_vec = in[i]; +#endif + out[i] = functor.Compute(x_vec); } while (idx == loop && tail) { @@ -231,6 +332,10 @@ class ActivationGPUKernel block = 256; #endif Functor functor; + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } constexpr int vecsize = CudaVecType::vecsize; int grid = max((num / vecsize + block - 1) / block, 1); auto stream = context.cuda_device_context().stream(); @@ -270,7 +375,12 @@ class ActivationGradGPUKernel #ifdef __HIPCC__ block = 256; #endif + Functor functor; + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } constexpr int vecsize = CudaVecType::vecsize; int grid = max((numel / vecsize + block - 1) / block, 1); auto stream = context.cuda_device_context().stream(); @@ -300,12 +410,28 @@ namespace plat = paddle::platform; ops::grad_functor>, \ ops::ActivationGradKernel>); - FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CUDA_KERNEL); +#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, op_name, functor, \ + grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + act_type, ops::ActivationGPUKernel>, \ + ops::ActivationGPUKernel>, \ + ops::ActivationGPUKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, ops::ActivationGradGPUKernel>, \ + ops::ActivationGradGPUKernel>, \ + ops::ActivationGradGPUKernel>); + /* ======================== leaky relu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor, - LeakyReluGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluGPUFunctor, + LeakyReluGradGPUFunctor); REGISTER_OP_CUDA_KERNEL( leaky_relu_grad_grad, @@ -330,21 +456,7 @@ 
REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* =========================== relu register ============================ */ -REGISTER_OP_CUDA_KERNEL( - relu, ops::ActivationGPUKernel>, - ops::ActivationGPUKernel>, - ops::ActivationGPUKernel>); - -REGISTER_OP_CUDA_KERNEL( - relu_grad, ops::ActivationGradGPUKernel>, - ops::ActivationGradGPUKernel>, - ops::ActivationGradGPUKernel>); +REGISTER_ACTIVATION_GPU_KERNEL(relu, Relu, ReluGPUFunctor, ReluGradGPUFunctor); REGISTER_OP_CUDA_KERNEL( relu_grad_grad, @@ -356,6 +468,19 @@ REGISTER_OP_CUDA_KERNEL( ops::ReluGradGradFunctor>); /* ========================================================================== */ +/* =========================== tanh register ============================ */ +REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor); + +REGISTER_OP_CUDA_KERNEL( + tanh_grad_grad, + ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>); +/* ========================================================================== */ + /* =========================== sqrt register ============================= */ REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index bc7def61b2e249..fb9f956f17c0b1 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -366,6 +366,36 @@ struct TanhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +template +struct TanhGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* Out, + const framework::Tensor* ddX, const framework::Tensor* dOut, + framework::Tensor* dOutNew, framework::Tensor* ddOut) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad")); + auto out = framework::EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad")); + // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out + // * ddx) + if (dOutNew) { + auto dout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad")); + auto dout_new = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SquareGradGrad")); + dout_new.device(*d) = + static_cast(-1) * dout * static_cast(2) * out * ddx; + } + if (ddOut) { + auto ddout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SquareGradGrad")); + ddout.device(*d) = (static_cast(1) - out * out) * ddx; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + // tanhshrink(x) = x - tanh(x) // where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template @@ -400,7 +430,7 @@ struct HardShrinkFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out) const { auto temp1 = x < static_cast(threshold * -1.f); auto temp2 = x > static_cast(threshold); - out.device(d) = x * (temp1 + temp2 > 0).template cast(); + out.device(d) = x * (temp1 + temp2).template cast(); } }; @@ -417,7 +447,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto temp1 = x < static_cast(threshold * -1.f); auto temp2 = x > static_cast(threshold); - dx.device(d) = dout * (temp1 + temp2 > 0).template cast(); + dx.device(d) 
= dout * (temp1 + temp2).template cast<T>(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -1734,6 +1764,58 @@ inline void ExtractDoubleGradTensorWithInputDOut( } } +template <typename DeviceContext, typename Functor> +class TanhDoubleGradKernel + : public framework::OpKernel<typename Functor::ELEMENT_TYPE> { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor *Out, *ddX, *dOut; + framework::Tensor *dOutNew, *ddOut; + Out = ddX = dOut = nullptr; + dOutNew = ddOut = nullptr; + + // extract ddx(input) and out(input) + auto ddx_var = ctx.InputVar("DDX"); + auto out_var = ctx.InputVar("Out"); + PADDLE_ENFORCE_NOT_NULL( + ddx_var, platform::errors::NotFound( + "Cannot get input Variable ddx, variable name = %s", + ctx.InputName("DDX"))); + PADDLE_ENFORCE_NOT_NULL( + out_var, platform::errors::NotFound( + "Cannot get input Variable out, variable name = %s", + ctx.InputName("Out"))); + ddX = ctx.Input<framework::Tensor>("DDX"); + Out = ctx.Input<framework::Tensor>("Out"); + + // set output ddout + auto ddout_var = ctx.OutputVar("DDOut"); + if (ddout_var) { + ddOut = ctx.Output<framework::Tensor>("DDOut"); + } + + // extract dOut(input) + auto dout_var = ctx.InputVar("DOut"); + PADDLE_ENFORCE_NOT_NULL( + dout_var, platform::errors::NotFound( + "Cannot get input Variable dout_var, variable name = %s", + ctx.InputName("DOut"))); + dOut = ctx.Input<framework::Tensor>("DOut"); + + // set output dout_new + auto dout_new_var = ctx.OutputVar("DOutNew"); + if (dout_new_var) { + dOutNew = ctx.Output<framework::Tensor>("DOutNew"); + } + + if (dOutNew) dOutNew->mutable_data<T>(Out->dims(), ctx.GetPlace()); + if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace()); + auto& place = ctx.template device_context<DeviceContext>(); + Functor functor; + functor(place, Out, ddX, dOut, dOutNew, ddOut); + } +}; template class SquareDoubleGradKernel : public framework::OpKernel { @@ -2048,7 +2130,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor { #define FOR_EACH_ACTIVATION_OP(__macro) \ __macro(sigmoid, Sigmoid, SigmoidFunctor, SigmoidGradFunctor); \ __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ - __macro(tanh, Tanh, TanhFunctor, TanhGradFunctor); \ __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc new file mode 100644 index 00000000000000..923b581af287d1 --- /dev/null +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -0,0 +1,368 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class PowNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto factor = ctx.Attr("factor"); + + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Power", {*x}, {*out}, + {{"power", factor}, + {"scale", static_cast(1.0)}, + {"shift", static_cast(0.0)}}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class PowGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto factor = ctx.Attr("factor"); + + auto x_dims = x->dims(); + + auto place = ctx.GetPlace(); + auto stream = + ctx.template device_context() + .stream(); + + // NOTE(liym27): dx = dout * factor * x.pow(factor-1) + + // Step1: Compute x_pow = x.pow(factor-1) + Tensor x_pow(x->type()); + x_pow.mutable_data(x->dims(), place); + auto runner_pow = NpuOpRunner("Power", {*x}, {x_pow}, + {{"power", factor - static_cast(1)}}); + runner_pow.Run(stream); + + // Step 2: Construct a broadcast factor, which has the same shape with x. + + // 2.1 Get a factor tensor with shape [1]. + Tensor factor_tensor(framework::proto::VarType::FP32); + factor_tensor.mutable_data({1}, place); + TensorFromVector(std::vector{factor}, ctx.device_context(), + &factor_tensor); + + // 2.2 Get the factor which has the shape with x and the same value with + // factor. 
+ Tensor factor_bc_tensor(framework::proto::VarType::FP32); + factor_bc_tensor.mutable_data(x_dims, place); + auto runner_bc = NpuOpRunner("FillD", {factor_tensor}, {factor_bc_tensor}, + {{"dims", framework::vectorize(x_dims)}}); + runner_bc.Run(stream); + + // Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1) + Tensor x_power_mul_factor(x->type()); + x_power_mul_factor.mutable_data(x->dims(), place); + auto runner_mul_1 = + NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {}); + runner_mul_1.Run(stream); + + // Step 4: Compute dx = dout * factor * x.pow(factor-1) + dx->mutable_data(place); + auto runner_mul_2 = + NpuOpRunner("Mul", {*dout, x_power_mul_factor}, {*dx}, {}); + runner_mul_2.Run(stream); + } +}; + +template +class ReluNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Relu", + { + *x, + }, + {*out}, {}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class ReluGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto stream = + ctx.template device_context() + .stream(); + + dx->mutable_data(ctx.GetPlace()); + auto runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {}); + + runner.Run(stream); + } +}; + +template +class SqrtNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Sqrt", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +template +class SqrtGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto dx_runner = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {}); + dx_runner.Run(stream); + } +}; + +template +class LogNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + Tensor one(x->type()); + one.mutable_data(x->dims(), place); + auto one_runner = NpuOpRunner("OnesLike", {*x}, {one}, {}); + one_runner.Run(stream); + + Tensor sub(x->type()); + sub.mutable_data(x->dims(), place); + auto sub_runner = NpuOpRunner("Sub", {*x, one}, {sub}, {}); + sub_runner.Run(stream); + + auto out_runner = NpuOpRunner("Log1p", {sub}, {*out}, {}); + out_runner.Run(stream); + } +}; + +template +class LogGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = 
ctx.Input("X"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + auto runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {}); + runner.Run(stream); + } +}; + +template +class TanhNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Tanh", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +template +class TanhGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = ctx.Input("Out"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto dx_runner = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {}); + dx_runner.Run(stream); + } +}; + +template +class SquareNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Square", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + pow, ops::PowNPUKernel, + ops::PowNPUKernel); + +REGISTER_OP_NPU_KERNEL( + pow_grad, ops::PowGradNPUKernel, + ops::PowGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + relu, ops::ReluNPUKernel, + ops::ReluNPUKernel); + +REGISTER_OP_NPU_KERNEL( + relu_grad, + ops::ReluGradNPUKernel, + ops::ReluGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + sqrt, ops::SqrtNPUKernel, + ops::SqrtNPUKernel); + +REGISTER_OP_NPU_KERNEL( + sqrt_grad, + ops::SqrtGradNPUKernel, + ops::SqrtGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + log, ops::LogNPUKernel, + ops::LogNPUKernel); + +REGISTER_OP_NPU_KERNEL( + log_grad, ops::LogGradNPUKernel, + ops::LogGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + tanh, ops::TanhNPUKernel, + ops::TanhNPUKernel); + +REGISTER_OP_NPU_KERNEL( + tanh_grad, + ops::TanhGradNPUKernel, + ops::TanhGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + square, ops::SquareNPUKernel, + ops::SquareNPUKernel, + ops::SquareNPUKernel); diff --git a/paddle/fluid/operators/addmm_op.h b/paddle/fluid/operators/addmm_op.h index 97e3ed9c1adda0..ecfd10d2fa6fbd 100644 --- a/paddle/fluid/operators/addmm_op.h +++ b/paddle/fluid/operators/addmm_op.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" @@ -32,8 +33,8 @@ template using EigenTensor = framework::EigenTensor; -using Array1 = Eigen::DSizes; -using Array2 = Eigen::DSizes; +using Array1 = Eigen::DSizes; +using Array2 = Eigen::DSizes; using Tensor = framework::Tensor; @@ -105,7 +106,8 @@ class AddMMKernel : public framework::OpKernel { auto eigen_out = EigenTensor::From(*out); auto& place = *context.template device_context().eigen_device(); - eigen_out.device(place) = eigen_input.broadcast(bcast_dims); + EigenBroadcast, T, 2>::Eval( + place, eigen_out, eigen_input, bcast_dims); blas.GEMM(false, false, x_dims[0], y_dims[1], x_dims[1], alpha, x->data(), x_dims[1], y->data(), y_dims[1], beta, diff --git a/paddle/fluid/operators/amp/CMakeLists.txt b/paddle/fluid/operators/amp/CMakeLists.txt index b3ff52a7ae119d..2ea8bbcbc61df8 100644 --- a/paddle/fluid/operators/amp/CMakeLists.txt +++ b/paddle/fluid/operators/amp/CMakeLists.txt @@ -4,3 +4,7 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() register_operators() + +if(WITH_ASCEND_CL) + cc_test(check_finite_and_unscale_op_npu_test SRCS check_finite_and_unscale_op_npu_test.cc DEPS op_registry check_finite_and_unscale_op scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu index 6840e4847c4c64..2c3a9c366e4fd0 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu @@ -26,18 +26,48 @@ __global__ void InverseAndMemset(const T* s, T* o, bool* found_inf) { } template -__global__ void CheckFiniteAndUnscale(const T* in, const MT* scale, int num, - bool* found_inf, T* out) { - const int idx = threadIdx.x + blockIdx.x * blockDim.x; - - if (idx < num) { - MT val = static_cast(in[idx]) * (*scale); +__global__ void CheckFiniteAndUnscale(const T** xs, const MT* scale, + int64_t size, int64_t* starts, + bool* found_inf, T** outs) { + const int64_t tid = threadIdx.x + blockIdx.x * blockDim.x; + + // copy starts array from global memory to shared memory + extern __shared__ int64_t s_starts[]; + for (int i = threadIdx.x; i <= size; i += blockDim.x) { + s_starts[i] = starts[i]; + } + __syncthreads(); + + const int64_t num = s_starts[size]; + int pre_xs_index = 0; + bool t_found_inf = false; + const MT t_scale = *scale; + for (int64_t idx = tid; idx < num; idx += gridDim.x * blockDim.x) { + // get the xs's index of thread + int xs_index = pre_xs_index; + while (idx < s_starts[xs_index]) xs_index++; + // avoid some tensor's numel is zero + while (idx >= s_starts[xs_index]) xs_index++; + pre_xs_index = xs_index - 1; + + // get in data and out data + const T* in = xs[pre_xs_index]; + T* out = outs[pre_xs_index]; + int64_t in_idx = idx - s_starts[pre_xs_index]; + + // Unscale + MT val = static_cast(in[in_idx]) * t_scale; T narrow_val = static_cast(val); - out[idx] = narrow_val; + out[in_idx] = narrow_val; + + // CheckFinite if (!isfinite(narrow_val)) { - *found_inf = true; + t_found_inf = true; } } + if (t_found_inf) { + *found_inf = true; + } } template @@ -63,20 +93,53 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { InverseAndMemset<<<1, 1, 0, dev_ctx.stream()>>>( scale_data, 
inverse_scale_v, found_inf_data); - for (size_t i = 0; i < xs.size(); ++i) { - const auto* x = xs[i]; - auto* out = outs[i]; - const T* x_data = x->data(); - T* out_data = out->mutable_data(dev_ctx.GetPlace()); - - int num = x->numel(); - int block = 1024; - int grid = (num + block - 1) / block; - VLOG(3) << "launch kernel"; - CheckFiniteAndUnscale<<>>( - x_data, inverse_scale_v, num, found_inf_data, out_data); - VLOG(3) << "finish kernel"; + size_t xs_size = xs.size(); + // calculate each tensor's start index and copy to device + auto h_starts_tensor = + memory::Alloc(platform::CPUPlace(), (xs_size + 1) * sizeof(int64_t)); + int64_t* h_starts = reinterpret_cast(h_starts_tensor->ptr()); + + auto d_starts_tensor = + memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); + int64_t* d_starts = reinterpret_cast(d_starts_tensor->ptr()); + + h_starts[0] = 0; + for (int i = 1; i <= xs_size; i++) { + // the start index value of each tensor is + // the sum of previous tensor's size + h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel(); + } + int64_t total_num = h_starts[xs_size]; + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_starts, platform::CPUPlace(), h_starts, + (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()); + + // copy each tensor's data address to device + auto h_mem = memory::Alloc(platform::CPUPlace(), 2 * xs_size * sizeof(T*)); + const T** h_xs = reinterpret_cast(h_mem->ptr()); + T** h_outs = reinterpret_cast(h_mem->ptr()) + xs_size; + + auto d_mem = memory::Alloc(dev_ctx, 2 * xs_size * sizeof(T*)); + const T** d_xs = reinterpret_cast(d_mem->ptr()); + T** d_outs = reinterpret_cast(d_mem->ptr()) + xs_size; + + for (size_t i = 0; i < xs_size; ++i) { + h_xs[i] = xs[i]->data(); + h_outs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); } + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), d_xs, + platform::CPUPlace(), h_xs, 2 * xs_size * sizeof(T*), + dev_ctx.stream()); + + // Launch Kernel + int block = 1024; + int block_num = block * 20; // each thread deal with 20 number + int grid = (total_num + block_num - 1) / block_num; + VLOG(3) << "launch kernel"; + CheckFiniteAndUnscale<<< + grid, block, (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()>>>( + d_xs, inverse_scale_v, xs_size, d_starts, found_inf_data, d_outs); + VLOG(3) << "finish kernel"; } }; } // namespace operators diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc new file mode 100644 index 00000000000000..46f9f7ff089448 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + const auto xs = ctx.MultiInput("X"); + const auto* scale = ctx.Input("Scale"); + auto outs = ctx.MultiOutput("Out"); + auto* found_inf = ctx.Output("FoundInfinite"); + + found_inf->mutable_data(ctx.GetPlace()); + + bool found_inf_data = false; + + auto stream = + ctx.template device_context() + .stream(); + + // step1: inverse scale(RealDiv) + Tensor const_tensor; + const_tensor.mutable_data({1}, ctx.GetPlace()); + TensorFromVector(std::vector{static_cast(1.0)}, ctx.device_context(), + &const_tensor); + + ctx.template device_context().Wait(); + + // Inverse(1.0/scale) + Tensor* tmp_inverse_out = const_cast(scale); + Tensor inverse_out(scale->type()); + inverse_out.Resize(scale->dims()); + inverse_out.mutable_data(ctx.GetPlace()); + auto runner_inverse = + NpuOpRunner("Div", {const_tensor, *scale}, {inverse_out}, {}); + runner_inverse.Run(stream); + tmp_inverse_out = &inverse_out; + + size_t x_size = xs.size(); + for (size_t i = 0; i < x_size; ++i) { + found_inf_data = true; + const auto* x = xs[i]; + auto* out = outs[i]; + out->mutable_data(ctx.GetPlace()); + + // step2: CheckNumerics + // CheckNumerics runs on the Ascend AI CPU, which delivers poor + // performance. + Tensor check_xout(x->type()); + check_xout.Resize(x->dims()); + check_xout.mutable_data(ctx.GetPlace()); + try { + auto runner_checknumerics = + NpuOpRunner("CheckNumerics", {*x}, {check_xout}, + {{"message", std::string("check_nan_and_inf")}}); + runner_checknumerics.Run(stream); + } catch (platform::EnforceNotMet& exception) { + LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; + found_inf_data = true; + } catch (...) { + LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; + found_inf_data = true; + } + + if (!found_inf_data) { + // MatMul + auto runner_matmul = + NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); + runner_matmul.Run(stream); + } else { + // ZerosLike + auto runner_zeroslike = NpuOpRunner("ZerosLike", {*x}, {*out}, {}); + runner_zeroslike.Run(stream); + } // end if + } // end for + + // set found_inf to true + if (found_inf_data) { + Tensor found_inf_tensor; + found_inf_tensor.Resize({1}); + bool* is_found_inf = + found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); + *is_found_inf = true; + framework::TensorCopySync(found_inf_tensor, ctx.GetPlace(), found_inf); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_NPU_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleNPUKernel, + ops::CheckFiniteAndUnscaleNPUKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc new file mode 100644 index 00000000000000..99e81a4757d0e0 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc @@ -0,0 +1,131 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/enforce.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +using Tensor = paddle::framework::Tensor; + +USE_OP(check_finite_and_unscale); +USE_OP_DEVICE_KERNEL(check_finite_and_unscale, NPU); + +struct InputVars { + std::string name; + f::LoDTensor *tensor; +}; + +template +void Compare(f::Scope *scope, const p::DeviceContext &ctx) { + const f::DDim dims = f::make_ddim({2, 2}); + auto place = ctx.GetPlace(); + + // init input + std::vector input_names = { + {"x", scope->Var("x")->GetMutable()}, + {"x1", scope->Var("x1")->GetMutable()}}; + + auto *scale = scope->Var("scale")->GetMutable(); + + // init output + auto *out = scope->Var("out")->GetMutable(); + auto *out1 = scope->Var("out1")->GetMutable(); + auto *found_inf = scope->Var("found_inf")->GetMutable(); + + // Initialize input data + const int num_inputs = input_names.size(); + size_t numel = static_cast(f::product(dims)); + + for (int i = 0; i < num_inputs; ++i) { + std::vector init_xs; + for (size_t j = 0; j < numel; ++j) { + if (j == 0) { + init_xs.push_back(static_cast(NAN)); + } else { + init_xs.push_back(static_cast(j + 1)); + } + } + f::TensorFromVector(init_xs, ctx, input_names[i].tensor); + input_names[i].tensor->Resize(dims); + } + + f::TensorFromVector(std::vector{static_cast(0.5)}, ctx, scale); + + ctx.Wait(); + + // run + f::AttributeMap attrs; + auto op = f::OpRegistry::CreateOp( + "check_finite_and_unscale", {{"X", {"x", "x1"}}, {"Scale", {"scale"}}}, + {{"Out", {"out", "out1"}}, {"FoundInfinite", {"found_inf"}}}, attrs); + op->Run(*scope, place); + ctx.Wait(); + + // out0 + std::vector out_vec; + f::TensorToVector(*out, ctx, &out_vec); + EXPECT_EQ(out_vec.size(), static_cast(4)); + for (size_t j = 0; j < out_vec.size(); ++j) { + VLOG(3) << "out_vec[" << j << "]:" << out_vec[j]; + } + + ctx.Wait(); + + // out0 + std::vector out1_vec; + f::TensorToVector(*out1, ctx, &out1_vec); + EXPECT_EQ(out1_vec.size(), static_cast(4)); + for (size_t j = 0; j < out1_vec.size(); ++j) { + VLOG(3) << "out1_vec[" << j << "]:" << out1_vec[j]; + } + + ctx.Wait(); + + // out found_inf + Tensor found_inf_tensor; + found_inf_tensor.Resize({1}); + bool *is_finite_data = + found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); + f::TensorCopy(*found_inf, place, &found_inf_tensor); + EXPECT_FALSE(*is_finite_data); + + ctx.Wait(); +} + +TEST(check_finite_and_unscale, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} + +TEST(check_finite_and_unscale, NPU_fp16) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc new file mode 100644 index 
00000000000000..dd6dbfd5c0b653 --- /dev/null +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -0,0 +1,219 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +void Update(const platform::NPUDeviceContext& ctx, + const std::vector found_inf_vec, + const Tensor* pre_loss_scaling_tensor, const Tensor* good_in_tensor, + const Tensor* bad_in_tensor, const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, const float incr_ratio, + const float decr_ratio, Tensor* updated_loss_scaling_tensor, + Tensor* good_out_tensor, Tensor* bad_out_tensor) { + auto place = ctx.GetPlace(); + auto stream = ctx.stream(); + if (found_inf_vec[0]) { + // good_out_data = 0 + auto g = good_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(g), 0, + good_out_tensor->numel() * sizeof(int), stream); + // bad_out_data = bad_in_data + 1 + Tensor factor_tensor(bad_out_tensor->type()); + factor_tensor.mutable_data({1}, place); + TensorFromVector(std::vector{1}, ctx, &factor_tensor); + auto runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor}, + {*bad_out_tensor}, {}); + runner_p2.Run(stream); + + std::vector bad_out_data; + TensorToVector(*bad_out_tensor, ctx, &bad_out_data); + if (bad_out_data[0] == decr_every_n_nan_or_inf) { + auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", decr_ratio}, + {"shift", static_cast(0)}}); + + runner_p3.Run(stream); + + std::vector new_loss_scaling; + TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); + if (new_loss_scaling[0] < static_cast(1)) { + // updated_loss_scaling_data = 1 + auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", static_cast(0)}, + {"shift", static_cast(1)}}); + + runner_p4.Run(stream); + } + + // bad_out_data = 0 + auto b = bad_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(b), 0, + bad_out_tensor->numel() * sizeof(int), stream); + } + } else { + // bad_out_data = 0 + auto b = bad_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(b), 0, + bad_out_tensor->numel() * sizeof(int), stream); + + // good_out_data = good_in_data + 1 + Tensor factor_tensor(good_out_tensor->type()); + factor_tensor.mutable_data({1}, place); + TensorFromVector(std::vector{1}, ctx, &factor_tensor); + auto runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor}, + {*good_out_tensor}, {}); + runner_p2.Run(stream); + + std::vector good_out_data; + TensorToVector(*good_out_tensor, ctx, &good_out_data); + + if (good_out_data[0] == incr_every_n_steps) { + auto runner_p3 = 
NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", incr_ratio}, + {"shift", static_cast(0)}}); + runner_p3.Run(stream); + + std::vector new_loss_scaling; + TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); + if (!std::isfinite(new_loss_scaling[0])) { + // updated_loss_scaling_data = pre_loss_scaling_data + auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", static_cast(1)}, + {"shift", static_cast(0)}}); + + runner_p4.Run(stream); + } + // good_out_data = 0 + auto g = good_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(g), 0, + good_out_tensor->numel() * sizeof(int), stream); + } + } +} + +template +class UpdateLossScalingFunctor { + public: + void operator()(const platform::NPUDeviceContext& dev_ctx, + const std::vector found_inf_vec, + const Tensor* pre_loss_scaling_tensor, + const Tensor* good_in_tensor, const Tensor* bad_in_tensor, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, const float incr_ratio, + const float decr_ratio, Tensor* updated_loss_scaling_tensor, + Tensor* good_out_tensor, Tensor* bad_out_tensor) const { + Update(dev_ctx, found_inf_vec, pre_loss_scaling_tensor, good_in_tensor, + bad_in_tensor, incr_every_n_steps, decr_every_n_nan_or_inf, + incr_ratio, decr_ratio, updated_loss_scaling_tensor, + good_out_tensor, bad_out_tensor); + } +}; + +template +class LazyZerosNPU { + public: + void operator()(const platform::NPUDeviceContext& dev_ctx, + const std::vector found_inf_vec, + const std::vector& xs, + const std::vector& outs) const { + for (size_t i = 0; i < xs.size(); ++i) { + auto* out = outs[i]; + if (found_inf_vec[0]) { + VLOG(4) << "-- UpdateLossScaling: Find infinite grads. 
--"; + + auto place = dev_ctx.GetPlace(); + auto stream = dev_ctx.stream(); + auto g = out->mutable_data(place); + platform::NPUMemsetAsync(static_cast(g), 0, + out->numel() * sizeof(T), stream); + } + } + } +}; + +template +class UpdateLossScalingNPUKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const auto xs = ctx.MultiInput("X"); + auto outs = ctx.MultiOutput("Out"); + const auto* found_inf = ctx.Input("FoundInfinite"); + PADDLE_ENFORCE_EQ(found_inf->numel(), 1, + platform::errors::InvalidArgument( + "FoundInfinite must has only one element.")); + + std::vector found_inf_vec; + TensorToVector(*found_inf, ctx.device_context(), &found_inf_vec); + + LazyZerosNPU{}(dev_ctx, found_inf_vec, xs, outs); + const bool stop_update = ctx.Attr("stop_update"); + if (stop_update) { + return; + } + + const auto* pre_loss_scaling = ctx.Input("PrevLossScaling"); + const auto* good_in = ctx.Input("InGoodSteps"); + const auto* bad_in = ctx.Input("InBadSteps"); + auto* updated_loss_scaling = ctx.Output("LossScaling"); + auto* good_out = ctx.Output("OutGoodSteps"); + auto* bad_out = ctx.Output("OutBadSteps"); + + updated_loss_scaling->mutable_data(dev_ctx.GetPlace()); + good_out->mutable_data(dev_ctx.GetPlace()); + bad_out->mutable_data(dev_ctx.GetPlace()); + + const int incr_every_n_steps = ctx.Attr("incr_every_n_steps"); + const int decr_every_n_nan_or_inf = + ctx.Attr("decr_every_n_nan_or_inf"); + const float incr_ratio = ctx.Attr("incr_ratio"); + const float decr_ratio = ctx.Attr("decr_ratio"); + UpdateLossScalingFunctor{}( + dev_ctx, found_inf_vec, pre_loss_scaling, good_in, bad_in, + incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, + updated_loss_scaling, good_out, bad_out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + update_loss_scaling, + ops::UpdateLossScalingNPUKernel, + ops::UpdateLossScalingNPUKernel); diff --git a/paddle/fluid/operators/assign_op_npu.cc b/paddle/fluid/operators/assign_op_npu.cc new file mode 100644 index 00000000000000..93689d5e495f33 --- /dev/null +++ b/paddle/fluid/operators/assign_op_npu.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
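The Update routine above implements the usual dynamic loss-scaling policy: an overflow step resets the good-step counter and, after decr_every_n_nan_or_inf consecutive overflow steps, shrinks the scale by decr_ratio (clamped to at least 1); a clean step resets the bad-step counter and, after incr_every_n_steps consecutive clean steps, grows the scale by incr_ratio unless the result would be non-finite. A scalar sketch of that policy, with illustrative names rather than the kernel's API:

#include <cmath>

struct LossScaleState {
  float scale;
  int good_steps = 0;
  int bad_steps = 0;
};

// Host-side sketch of the dynamic loss-scaling update described above.
void UpdateLossScale(bool found_inf, int incr_every_n_steps,
                     int decr_every_n_nan_or_inf, float incr_ratio,
                     float decr_ratio, LossScaleState* s) {
  if (found_inf) {
    s->good_steps = 0;
    if (++s->bad_steps == decr_every_n_nan_or_inf) {
      s->scale *= decr_ratio;               // shrink the scale
      if (s->scale < 1.0f) s->scale = 1.0f; // never drop below 1
      s->bad_steps = 0;
    }
  } else {
    s->bad_steps = 0;
    if (++s->good_steps == incr_every_n_steps) {
      float grown = s->scale * incr_ratio;  // grow the scale
      if (std::isfinite(grown)) s->scale = grown;
      s->good_steps = 0;
    }
  }
}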
*/ + +#include + +#include "paddle/fluid/operators/assign_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace framework { +class OpDesc; +class Variable; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +struct CUDAPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace paddle { +namespace operators { +template +class AssignNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + assign, ops::AssignNPUKernel, + ops::AssignNPUKernel, + ops::AssignNPUKernel) diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc new file mode 100644 index 00000000000000..5cf1303a229a90 --- /dev/null +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(assign); +USE_OP_DEVICE_KERNEL(assign, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + std::vector init; + init.push_back(static_cast(1.0)); + init.push_back(static_cast(2.0)); + init.push_back(static_cast(3.0)); + init.push_back(static_cast(4.0)); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({4}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + auto op = + f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}}, {{"Out", {"Out"}}}, {}); + + op->Run(*scope, place); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + ctx.Wait(); + + EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)4); + EXPECT_EQ(out_vec[0], static_cast(1.0)); + EXPECT_EQ(out_vec[1], static_cast(2.0)); + EXPECT_EQ(out_vec[2], static_cast(3.0)); + EXPECT_EQ(out_vec[3], static_cast(4.0)); +} + +TEST(assign, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx, "assign"); +} diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu index 99153101fc326c..8bd2b7fe2d127c 100644 --- a/paddle/fluid/operators/bce_loss_op.cu +++ b/paddle/fluid/operators/bce_loss_op.cu @@ -32,6 +32,11 @@ __global__ void GPUBCELossForward(const T* x_data, const T* label_data, T one = static_cast(1.); T neg_100 = static_cast(-100.); + PADDLE_ENFORCE( + (x >= static_cast(0)) && (x <= one), + "Input is expected to be within the interval [0, 1], but recieved %f.", + x); + T term1 = max(real_log(x), neg_100); T term2 = max(real_log(one - x), neg_100); @@ -64,29 +69,13 @@ class BCELossCUDAKernel : public framework::OpKernel { auto* labels = ctx.Input("Label"); auto* out = ctx.Output("Out"); - auto x_data = x->data(); - auto out_data = out->mutable_data(ctx.GetPlace()); + const auto* x_data = x->data(); + auto* out_data = out->mutable_data(ctx.GetPlace()); auto x_numel = x->numel(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), x_numel); - - Tensor x_cpu; - framework::TensorCopy(*x, platform::CPUPlace(), &x_cpu); - T* x_cpu_data = x_cpu.data(); - - for (int64_t i = 0; i < x_numel; ++i) { - PADDLE_ENFORCE_GE( - x_cpu_data[i], static_cast(0), - platform::errors::InvalidArgument( - "Illegal input, input must be greater than or equal to 0")); - PADDLE_ENFORCE_LE( - x_cpu_data[i], static_cast(1), - platform::errors::InvalidArgument( - "Illegal input, input must be less than or equal to 1")); - } - auto& dev_ctx = ctx.cuda_device_context(); + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(dev_ctx, x_numel); GPUBCELossForward<<>>(x_data, labels->data(), @@ -102,9 +91,10 @@ class BCELossGradCUDAKernel : public framework::OpKernel { auto* labels = ctx.Input("Label"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); - auto 
dx_data = dx->mutable_data(ctx.GetPlace());
   int x_numel = x->numel();
+  auto* dx_data = dx->mutable_data(ctx.GetPlace());
+
+  auto& dev_ctx = ctx.cuda_device_context();
   platform::GpuLaunchConfig config =
       platform::GetGpuLaunchConfig1D(dev_ctx, x_numel);
diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc
new file mode 100644
index 00000000000000..20b33c4e4e05a6
--- /dev/null
+++ b/paddle/fluid/operators/cast_op_npu.cc
@@ -0,0 +1,92 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_ASCEND_CL
+#include
+#include
+
+#include "paddle/fluid/operators/cast_op.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
+
+namespace paddle {
+namespace operators {
+
+static std::map
+    DTYPE_2_ACL_DTYPE = {
+        {framework::proto::VarType::BOOL, ACL_BOOL},
+        {framework::proto::VarType::INT16, ACL_INT16},
+        {framework::proto::VarType::INT32, ACL_INT32},
+        {framework::proto::VarType::INT64, ACL_INT64},
+        {framework::proto::VarType::FP16, ACL_FLOAT16},
+        {framework::proto::VarType::FP32, ACL_FLOAT},
+        {framework::proto::VarType::FP64, ACL_DOUBLE},
+};
+
+using Tensor = framework::Tensor;
+
+template
+class CastNPUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input("X");
+    int dtype = ctx.Attr("out_dtype");
+
+    auto* out = ctx.Output("Out");
+
+    auto place = ctx.GetPlace();
+
+    auto iter = DTYPE_2_ACL_DTYPE.find(
+        static_cast(dtype));
+    int aclDtype = iter->second;
+
+    if (dtype == framework::proto::VarType::FP32) {
+      out->mutable_data(place);
+    } else if (dtype == framework::proto::VarType::FP16) {
+      out->mutable_data(place);
+    } else if (dtype == framework::proto::VarType::INT16) {
+      out->mutable_data(place);
+    } else if (dtype == framework::proto::VarType::INT32) {
+      out->mutable_data(place);
+    } else if (dtype == framework::proto::VarType::INT64) {
+      out->mutable_data(place);
+    } else if (dtype == framework::proto::VarType::FP64) {
+      out->mutable_data(place);
+    } else if (dtype == framework::proto::VarType::BOOL) {
+      out->mutable_data(place);
+    }
+
+    auto stream =
+        ctx.template device_context()
+            .stream();
+
+    auto runner = NpuOpRunner("Cast", {*x}, {*out},
+                              {{"dst_type", static_cast(aclDtype)}});
+    runner.Run(stream);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_NPU_KERNEL(
+    cast, ops::CastNPUKernel,
+    ops::CastNPUKernel,
+    ops::CastNPUKernel,
+    ops::CastNPUKernel,
+    ops::CastNPUKernel,
+    ops::CastNPUKernel,
+    ops::CastNPUKernel);
+#endif
diff --git a/paddle/fluid/operators/cast_op_xpu.cc b/paddle/fluid/operators/cast_op_xpu.cc
index bbd43274a002d8..ca15858cf67d75 100644
--- a/paddle/fluid/operators/cast_op_xpu.cc
+++ b/paddle/fluid/operators/cast_op_xpu.cc
@@ -23,8 +23,22 @@ limitations under the License.
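The NPU cast kernel above resolves the requested VarType to an ACL dtype with an unchecked map lookup; a defensive variant would confirm the entry exists before dereferencing the iterator. A small sketch of such a checked lookup, as a hypothetical helper that is not part of the patch:

#include <map>
#include <stdexcept>
#include <string>

// Hypothetical checked lookup: map a dtype key to a backend enum value,
// throwing a descriptive error instead of dereferencing map.end().
template <typename Key, typename Value>
Value CheckedLookup(const std::map<Key, Value>& table, const Key& key,
                    const std::string& what) {
  auto it = table.find(key);
  if (it == table.end()) {
    throw std::runtime_error("Unsupported " + what);
  }
  return it->second;
}

A call shaped like CheckedLookup(DTYPE_2_ACL_DTYPE, var_type, "cast dtype") would then fail with a clear message for any dtype the table does not cover.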
*/ namespace paddle { namespace operators { +template +class XPUFPTypeTrait { + public: + using Type = T; +}; + +template <> +class XPUFPTypeTrait { + public: + using Type = float16; +}; + template class CastXPUKernel : public framework::OpKernel { + using XPUInTDType = typename XPUFPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { auto* in = context.Input("X"); @@ -34,27 +48,39 @@ class CastXPUKernel : public framework::OpKernel { auto out_type = static_cast( context.Attr("out_dtype")); auto* in_data = in->data(); + + // using XPUOutTDType = typename XPUFPTypeTrait::Type; auto numel = in->numel(); auto& dev_ctx = context.template device_context(); int r = -1; if (out_type == framework::proto::VarType::FP32) { auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data, - numel); + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out_data, numel); } else if (out_type == framework::proto::VarType::INT32) { auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data, - numel); + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out_data, numel); } else if (out_type == framework::proto::VarType::INT64) { auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data, - numel); + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out_data, numel); } else if ((out_type == framework::proto::VarType::BOOL) && (in_type == framework::proto::VarType::FP32)) { auto* out_data = out->mutable_data(context.GetPlace()); r = xpu::cast_v2( dev_ctx.x_context(), (const float*)in_data, reinterpret_cast(out_data), numel); + } else if (out_type == framework::proto::VarType::FP16) { + auto* out_data = + out->mutable_data(context.GetPlace()); + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + reinterpret_cast(out_data), numel); + } else { PADDLE_THROW(platform::errors::Unavailable("Not supported cast %d -> %d", in_type, out_type)); @@ -75,5 +101,7 @@ namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( cast, ops::CastXPUKernel, ops::CastXPUKernel, + ops::CastXPUKernel, ops::CastXPUKernel); #endif diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 8920541b9b9dcc..977a208d20e783 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -19,6 +19,12 @@ if(WITH_NCCL OR WITH_RCCL) op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) endif() +if(WITH_ASCEND) + op_library(gen_nccl_id_op) + op_library(c_gen_nccl_id_op) +endif() + + if(WITH_GLOO) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper) endif() diff --git a/paddle/fluid/operators/collective/allreduce_op.cc b/paddle/fluid/operators/collective/allreduce_op.cc index 86f1c28a9dd4f5..63b135a74cf4b7 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cc @@ -15,7 +15,7 @@ limitations under the License. 
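The XPUFPTypeTrait added above is a compile-time mapping: every element type passes through unchanged except the framework's float16, which is swapped for the device library's half type so the templated kernel can hand xpu::cast_v2 the pointer type it expects. A stand-alone sketch of the pattern, with illustrative stand-in types:

#include <cstdint>

// Stand-in types for the framework half and the device-library half.
struct FrameworkHalf { uint16_t bits; };
struct DeviceHalf { uint16_t bits; };

// Identity mapping by default...
template <typename T>
struct DeviceTypeTrait {
  using Type = T;
};

// ...specialized so the framework half maps onto the device half.
template <>
struct DeviceTypeTrait<FrameworkHalf> {
  using Type = DeviceHalf;
};

static_assert(sizeof(DeviceTypeTrait<float>::Type) == sizeof(float), "");
static_assert(sizeof(DeviceTypeTrait<FrameworkHalf>::Type) ==
                  sizeof(DeviceHalf), "");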
*/ #include // NOLINT #include -#include "paddle/fluid/operators/distributed_ops/allreduce_op.h" +#include "paddle/fluid/operators/collective/allreduce_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/allreduce_op.cu.cc b/paddle/fluid/operators/collective/allreduce_op.cu.cc index 9b70f78399026b..fe2e4910552706 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cu.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/distributed_ops/allreduce_op.h" +#include "paddle/fluid/operators/collective/allreduce_op.h" namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 1592d809f91e26..7da30f64d1ce39 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -27,6 +27,7 @@ limitations under the License. */ namespace paddle { namespace operators { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void GenNCCLID(std::vector* nccl_ids) { for (size_t i = 0; i < nccl_ids->size(); ++i) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -84,6 +85,21 @@ class CGenNCCLIdOp : public framework::OperatorBase { } }; +#else +class CGenNCCLIdOp : public framework::OperatorBase { + public: + CGenNCCLIdOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + class CGenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index c4abe284d72096..700d1173e2ff68 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -15,40 +15,20 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle - namespace paddle { namespace operators { -class CSyncCalcStreamOp : public framework::OperatorBase { +class CSyncCalcStreamOp : public framework::OperatorWithKernel { public: - CSyncCalcStreamOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(is_gpu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on gpu place only for now.")); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) - auto dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(dev_ctx->stream())); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); -#endif -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); -#endif + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); } }; @@ -65,10 +45,36 @@ Call calculation stream synchronization. } }; +template +class CSyncCalcStreamCudaKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) + + auto place = ctx.GetPlace(); + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(dev_ctx->stream())); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); +#endif + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(c_sync_calc_stream, ops::CSyncCalcStreamOp, - ops::CSyncCalcStreamOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(c_sync_calc_stream, ops::CSyncCalcStreamOp, + ops::CSyncCalcStreamOpMaker); + +REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream, + ops::CSyncCalcStreamCudaKernel); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index adf27069f524e4..95b9cd040fe94e 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -14,45 +14,25 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle + #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" #endif namespace paddle { namespace operators { -class CSyncCommStreamOp : public framework::OperatorBase { +class CSyncCommStreamOp : public framework::OperatorWithKernel { public: - CSyncCommStreamOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(is_gpu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on gpu place only for now.")); + using framework::OperatorWithKernel::OperatorWithKernel; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - int ring_id = Attr("ring_id"); - auto stream = - platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); -#endif + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); } }; @@ -72,10 +52,38 @@ Call communication stream synchronization. 
} }; +template +class CSyncCommStreamCudaKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + + auto place = ctx.GetPlace(); + + int ring_id = ctx.Attr("ring_id"); + auto stream = + platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); + +#ifdef PADDLE_WITH_RCCL + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); +#endif + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(c_sync_comm_stream, ops::CSyncCommStreamOp, - ops::CSyncCommStreamOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(c_sync_comm_stream, ops::CSyncCommStreamOp, + ops::CSyncCommStreamOpMaker); + +REGISTER_OP_CUDA_KERNEL(c_sync_comm_stream, + ops::CSyncCommStreamCudaKernel); diff --git a/paddle/fluid/operators/collective/gen_nccl_id_op.cc b/paddle/fluid/operators/collective/gen_nccl_id_op.cc index 679713d05bcb40..99a92469e8502b 100644 --- a/paddle/fluid/operators/collective/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_nccl_id_op.cc @@ -34,6 +34,7 @@ class Scope; namespace paddle { namespace operators { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void GenNCCLID(std::vector* nccl_ids) { for (size_t i = 0; i < nccl_ids->size(); ++i) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -194,6 +195,20 @@ class GenNCCLIdOp : public framework::OperatorBase { } }; +#else +class GenNCCLIdOp : public framework::OperatorBase { + public: + GenNCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + class GenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { diff --git a/paddle/fluid/operators/concat_op_npu.cc b/paddle/fluid/operators/concat_op_npu.cc new file mode 100644 index 00000000000000..87bb3397ca2672 --- /dev/null +++ b/paddle/fluid/operators/concat_op_npu.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/concat_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class ConcatNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + framework::LoDTensor* out = ctx.Output("Out"); + PADDLE_ENFORCE_NOT_NULL(ins[0], + platform::errors::NotFound( + "The first input tensor is not initalized.")); + auto axis = ctx.Attr("axis"); + + if (ctx.HasInput("AxisTensor")) { + PADDLE_THROW(platform::errors::NotFound( + "The AxisTensor is not supported on NPU now.")); + } + axis = ComputeAxis(static_cast(axis), + static_cast(ins[0]->dims().size())); + + auto place = ctx.GetPlace(); + out->mutable_data(place); + + std::vector inputs; + std::vector names; + for (size_t i = 0; i < ins.size(); ++i) { + if (ins[i] && ins[i]->numel() > 0) { + inputs.push_back(*ins[i]); + names.push_back("x" + std::to_string(i)); + } else { + continue; + } + } + auto stream = + ctx.template device_context() + .stream(); + auto runner = NpuOpRunner( + "ConcatD", {inputs}, {*out}, + {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}); + runner.AddInputNames(names); + runner.Run(stream); + } +}; + +template +class ConcatGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto ins = ctx.MultiInput("X"); + auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); + auto outs = + ctx.MultiOutput(framework::GradVarName("X")); + + PADDLE_ENFORCE_NOT_NULL(ins[0], + platform::errors::NotFound( + "The first input tensor is not initalized.")); + + auto axis = ctx.Attr("axis"); + + axis = ComputeAxis(static_cast(axis), + static_cast(ins[0]->dims().size())); + + int offset = 0; + auto stream = + ctx.template device_context() + .stream(); + for (size_t j = 0; j < outs.size(); ++j) { + // For stop gradient + // get output tensor that the name is not kEmptyVarName + if (out_var_names[j] != framework::kEmptyVarName && + outs[j]->numel() != 0UL) { + outs[j]->mutable_data(ctx.GetPlace()); + std::vector offsets; + std::vector sizes; + for (int dim = 0; dim < ins[j]->dims().size(); ++dim) { + if (dim == axis) { + offsets.push_back(offset); + sizes.push_back(ins[j]->dims()[dim]); + } else { + offsets.push_back(0); + sizes.push_back(ins[j]->dims()[dim]); + } + } + auto runner = NpuOpRunner("SliceD", {*out_grad}, {*outs[j]}, + {{"offsets", offsets}, {"size", sizes}}); + runner.Run(stream); + } + if (ins[j]->numel() != 0UL) { + offset += ins[j]->dims()[axis]; + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL(concat, ops::ConcatNPUKernel, + ops::ConcatNPUKernel, + ops::ConcatNPUKernel); + +REGISTER_OP_NPU_KERNEL(concat_grad, ops::ConcatGradNPUKernel, + ops::ConcatGradNPUKernel, + ops::ConcatGradNPUKernel); diff --git a/paddle/fluid/operators/concat_op_xpu.cc b/paddle/fluid/operators/concat_op_xpu.cc index aa0002cc6d1777..be299babdba7a4 100644 --- a/paddle/fluid/operators/concat_op_xpu.cc +++ b/paddle/fluid/operators/concat_op_xpu.cc @@ -132,16 +132,14 @@ class ConcatGradXPUKernel : public framework::OpKernel { axis = ComputeAxis(static_cast(axis), static_cast(ins[0]->dims().size())); // get output tensor that the name is not kEmptyVarName - std::vector outputs; - std::vector choose_idx; - int n = 0; + std::vector 
ptrs(outs.size()); for (size_t j = 0; j < outs.size(); ++j) { if (out_var_names[j] != framework::kEmptyVarName && outs[j]->numel() != 0UL) { outs[j]->mutable_data(ctx.GetPlace()); - outputs.push_back(outs[j]); - choose_idx.push_back(j); - n++; + ptrs[j] = outs[j]->data(); + } else { + ptrs[j] = nullptr; } } PADDLE_ENFORCE_GE(axis, 0, platform::errors::InvalidArgument( @@ -157,10 +155,10 @@ class ConcatGradXPUKernel : public framework::OpKernel { axis, out_grad->dims().size())); auto input_dims = ins[0]->dims(); - std::vector split_list(n); + std::vector split_list(ins.size()); std::vector xdims_list(input_dims.size()); int total_length = 0; - for (int i = 0; i < n; ++i) { + for (size_t i = 0; i < ins.size(); ++i) { split_list[i] = ins[i]->dims()[axis]; total_length += ins[i]->dims()[axis]; } @@ -172,11 +170,6 @@ class ConcatGradXPUKernel : public framework::OpKernel { } xdims_list[axis] = total_length; - std::vector ptrs(n); - for (int i = 0; i < n; ++i) { - ptrs[i] = outputs[i]->data(); - } - auto& dev_ctx = ctx.template device_context(); int r = xpu::split(dev_ctx.x_context(), out_grad->data(), ptrs, xdims_list, split_list, axis); diff --git a/paddle/fluid/operators/controlflow/compare_op_npu.cc b/paddle/fluid/operators/controlflow/compare_op_npu.cc new file mode 100644 index 00000000000000..591fb55936734f --- /dev/null +++ b/paddle/fluid/operators/controlflow/compare_op_npu.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
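Both concat gradient kernels above (NPU via SliceD, XPU via xpu::split) recover each input's gradient by slicing the concatenated output gradient along the concat axis, so the only bookkeeping is a running offset plus each input's extent on that axis. A small host-side sketch of that computation, illustrative rather than the kernel API:

#include <cstdint>
#include <utility>
#include <vector>

// For each input extent along the concat axis, return the (offset, size)
// window to slice out of the concatenated gradient.
std::vector<std::pair<int64_t, int64_t>> SliceWindows(
    const std::vector<int64_t>& axis_extents) {
  std::vector<std::pair<int64_t, int64_t>> windows;
  int64_t offset = 0;
  for (int64_t extent : axis_extents) {
    windows.emplace_back(offset, extent);
    offset += extent;
  }
  return windows;
}
// Example: extents {2, 3, 5} -> windows {(0, 2), (2, 3), (5, 5)}.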
*/ + +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#ifdef PADDLE_WITH_ASCEND_CL + +namespace paddle { +namespace operators { + +template +class EqualNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Equal", {*x, *y}, {*out}, {}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class LessThanNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + // int axis = context.Attr("axis"); + z->mutable_data(ctx.GetPlace()); // allocate + auto runner = NpuOpRunner("Less", {*x, *y}, {*z}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(equal, ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel); + +REGISTER_OP_NPU_KERNEL( + less_than, + ops::LessThanNPUKernel, + ops::LessThanNPUKernel); + +#endif diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index b9ea2ade6cb90b..6513bae839e989 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -78,6 +78,13 @@ class ConditionalOp : public framework::OperatorBase { framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); res = cpu_tensor.data()[0]; +#endif + } else if (platform::is_npu_place(ips[0]->place())) { +#ifdef PADDLE_WITH_ASCEND_CL + framework::LoDTensor cpu_tensor; + framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); + platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); + res = cpu_tensor.data()[0]; #endif } else { res = ips[0]->data()[0]; diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index d86b6b48422d94..fdd1b776bd8fa3 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -44,6 +44,11 @@ static void DataCopy(const framework::LoDTensor &src_item, TensorCopySync(src_item, platform::CPUPlace(), dst_item); } #else +#ifdef PADDLE_WITH_ASCEND_CL + if (platform::is_npu_place(src_item.place())) { + platform::DeviceContextPool::Instance().Get(src_item.place())->Wait(); + } +#endif TensorCopySync(src_item, platform::CPUPlace(), dst_item); #endif } else { diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc new file mode 100644 index 00000000000000..1b0c0e444347af --- /dev/null +++ b/paddle/fluid/operators/controlflow/logical_op_npu.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
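The conditional_block and fetch changes above follow one rule for NPU tensors: a value produced on an asynchronous device queue may be read on the host only after the device context has been waited on. A plain C++ illustration of that ordering, with the device emulated by std::async rather than framework code:

#include <chrono>
#include <future>
#include <thread>

// The flag is produced asynchronously (here by another thread standing in
// for the device); get() waits before the value is read, mirroring the
// DeviceContext::Wait() call in the kernels above.
bool ReadFlagAfterWait() {
  std::future<bool> device_flag = std::async(std::launch::async, [] {
    std::this_thread::sleep_for(std::chrono::milliseconds(1));  // device work
    return true;
  });
  return device_flag.get();
}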
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include + +#include "paddle/fluid/operators/controlflow/logical_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class LogicalNotNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("LogicalNot", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + logical_not, + ops::LogicalNotNPUKernel); + +#endif diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 39e9d37ddc6c75..ab535e341f7575 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -1363,7 +1363,14 @@ REGISTER_OP_KERNEL( conv2d_grad_grad, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvDoubleGradOpKernel, paddle::operators::CUDNNConvDoubleGradOpKernel); - +// ROCM has limit thread in depthwise_conv.cu and willl result in accuracy issue +// Use depthwise_conv2d in MIOPEN to resolve this issue +REGISTER_OP_KERNEL(depthwise_conv2d, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(depthwise_conv2d_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); REGISTER_OP_CUDA_KERNEL( depthwise_conv2d_grad_grad, paddle::operators::CUDNNConvDoubleGradOpKernel, diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 364e3ab8d26c3f..94d1f707b74c2e 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -903,29 +903,19 @@ class DepthwiseConvKernel : public framework::OpKernel { "and input channel number is %d", output->dims()[1], input->dims()[1])); } - // transform tensor - Tensor transformed_input(input->type()); - Tensor transformed_output(output->type()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output, - &transformed_output); - - } else { - transformed_input = *input; - transformed_output = *output; - } // update padding and dilation - auto in_dims = transformed_input.dims(); + auto in_dims = input->dims(); auto filter_dims = filter.dims(); framework::DDim in_data_dims; - in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_format); + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + } 
else { + in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); + } framework::DDim filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); @@ -944,16 +934,12 @@ class DepthwiseConvKernel : public framework::OpKernel { if (fuse_relu) { math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, transformed_input, filter, strides, paddings, - dilations, &transformed_output); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output, data_layout); } else { math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, transformed_input, filter, strides, paddings, - dilations, &transformed_output); - } - if (channel_last) { - TransToChannelLast(context, &transformed_output, - output); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output, data_layout); } } }; @@ -981,33 +967,18 @@ class DepthwiseConvGradKernel : public framework::OpKernel { context.Attr("padding_algorithm"); const std::string data_format = context.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensor - Tensor transformed_input(input->type()); - Tensor transformed_output_grad(output_grad->type()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output_grad, - &transformed_output_grad); - TransToChannelFirst(context, output_grad, - &transformed_output_grad); - - } else { - transformed_input = *input; - transformed_output_grad = *output_grad; - } - // update padding and dilation - auto in_dims = transformed_input.dims(); + auto in_dims = input->dims(); auto filter_dims = filter.dims(); framework::DDim in_data_dims; - in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_format); + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); + } framework::DDim filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); std::vector ksize = framework::vectorize(filter_data_dims); @@ -1025,33 +996,18 @@ class DepthwiseConvGradKernel : public framework::OpKernel { if (input_grad) { input_grad->mutable_data(context.GetPlace()); - Tensor transformed_input_grad(input_grad->type()); - if (channel_last) { - ResizeToChannelFirst(context, input_grad, - &transformed_input_grad); - - } else { - transformed_input_grad = *input_grad; - } - - set_zero(dev_ctx, &transformed_input_grad, static_cast(0)); + set_zero(dev_ctx, input_grad, static_cast(0)); if (fuse_relu) { math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, transformed_input, filter, - transformed_output_grad, strides, paddings, - dilations, &transformed_input_grad); + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, dilations, input_grad, data_layout); } else { math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, transformed_input, filter, - transformed_output_grad, strides, paddings, - dilations, &transformed_input_grad); - } - if (channel_last) { - TransToChannelLast(context, &transformed_input_grad, - input_grad); + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, dilations, input_grad, 
data_layout); } } @@ -1061,15 +1017,13 @@ class DepthwiseConvGradKernel : public framework::OpKernel { if (fuse_relu) { math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, transformed_input, - transformed_output_grad, strides, paddings, - dilations, filter_grad); + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, + paddings, dilations, filter_grad, data_layout); } else { math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, transformed_input, - transformed_output_grad, strides, paddings, - dilations, filter_grad); + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, + paddings, dilations, filter_grad, data_layout); } } } diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index a712d31cf7e2c3..c4cd5854c0f78a 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -490,10 +490,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { bool deterministic = FLAGS_cudnn_deterministic; T* input_grad_data = nullptr; T* filter_grad_data = nullptr; - if (input_grad) - input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - if (filter_grad) - filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); if (input_grad) { input_grad_data = input_grad->mutable_data(ctx.GetPlace()); @@ -884,7 +880,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { int iwo_group = groups; int c_group = 1; -#if CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) iwo_group = 1; c_group = groups; groups = 1; @@ -948,7 +944,8 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args1.idesc.set(transformed_ddO_channel, iwo_group); args1.wdesc.set(*W, layout, iwo_group); args1.odesc.set(transformed_ddX, iwo_group); - args1.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args1.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; workspace_size = search1::GetWorkspaceSize(args1); @@ -967,7 +964,8 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args2.idesc.set(transformed_ddO_channel, iwo_group); args2.wdesc.set(*ddW, layout, iwo_group); args2.odesc.set(transformed_X, iwo_group); - args2.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args2.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; workspace_size = @@ -991,7 +989,8 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args3.odesc.set(transformed_ddX_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args3.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search3 = SearchAlgorithm; workspace_size = @@ -1013,7 +1012,8 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args4.idesc.set(transformed_dO, iwo_group); args4.wdesc.set(*ddW, layout, iwo_group); args4.odesc.set(transformed_dX_channel, iwo_group); - args4.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args4.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search4 = 
SearchAlgorithm; workspace_size = @@ -1083,6 +1083,10 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { if (ddW) { for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + Tensor conv_x_ddw(dO->type()); + conv_x_ddw.Resize(transformed_ddO_channel.dims()); + T* conv_x_ddw_data = conv_x_ddw.mutable_data(ctx.GetPlace()); wkspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -1090,11 +1094,17 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { handle, &alpha, args2.odesc.desc(), x + i * group_offset_in, args2.wdesc.desc(), ddw + i * group_offset_filter, args2.cdesc.desc(), - bwd_algo2, &alpha, args2.idesc.desc(), - transformed_ddy_channel + i * group_offset_out, - workspace_ptr, workspace_size)); + bwd_algo2, &beta, args2.idesc.desc(), + conv_x_ddw_data + i * group_offset_out, workspace_ptr, + workspace_size)); }, workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + handle, miopenTensorOpAdd, &alpha, args2.idesc.desc(), + transformed_ddy_channel + i * group_offset_out, &alpha, + args2.idesc.desc(), conv_x_ddw_data + i * group_offset_out, &beta, + args2.idesc.desc(), + transformed_ddy_channel + i * group_offset_out)); #else // PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cu b/paddle/fluid/operators/detection/anchor_generator_op.cu index b4c27a63dbd2f2..388b8531571086 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.cu +++ b/paddle/fluid/operators/detection/anchor_generator_op.cu @@ -49,14 +49,11 @@ __global__ void GenAnchors(T* out, const T* aspect_ratios, const int ar_num, anchor_width = scale_w * base_w; anchor_height = scale_h * base_h; - T xmin = (x_ctr - 0.5 * (anchor_width - 1)); - T ymin = (y_ctr - 0.5 * (anchor_height - 1)); - T xmax = (x_ctr + 0.5 * (anchor_width - 1)); - T ymax = (y_ctr + 0.5 * (anchor_height - 1)); - out[i * 4] = xmin; - out[i * 4 + 1] = ymin; - out[i * 4 + 2] = xmax; - out[i * 4 + 3] = ymax; + T xmin = (x_ctr - .5f * (anchor_width - 1)); + T ymin = (y_ctr - .5f * (anchor_height - 1)); + T xmax = (x_ctr + .5f * (anchor_width - 1)); + T ymax = (y_ctr + .5f * (anchor_height - 1)); + reinterpret_cast(out)[i] = make_float4(xmin, ymin, xmax, ymax); } } diff --git a/paddle/fluid/operators/detection/anchor_generator_op.h b/paddle/fluid/operators/detection/anchor_generator_op.h index e0e499d76a19ba..599f6935736f94 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.h +++ b/paddle/fluid/operators/detection/anchor_generator_op.h @@ -22,6 +22,19 @@ limitations under the License. 
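The HIP branch of the conv-transpose double-grad kernel above works around MIOpen's restriction that the convolution's beta must be 0: the convolution result is written to a scratch tensor and then blended into the destination with a separate tensor add. The general shape of that workaround, sketched on plain buffers, where fx stands for the precomputed convolution result:

#include <cstddef>
#include <vector>

// When a library call cannot accumulate into its output (beta must be 0),
// compute into a scratch buffer first and blend afterwards:
//   y = alpha * y + alpha * f(x)
void AccumulateViaScratch(const std::vector<float>& fx, float alpha,
                          std::vector<float>* y) {
  for (std::size_t i = 0; i < y->size(); ++i) {
    (*y)[i] = alpha * (*y)[i] + alpha * fx[i];
  }
}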
*/ namespace paddle { namespace operators { +#ifdef PADDLE_WITH_CUDA +template +extern __global__ void GenAnchors(T* out, const T* aspect_ratios, + const int ar_num, const T* anchor_sizes, + const int as_num, const T* stride, + const int sd_num, const int height, + const int width, const T offset); + +template +extern __global__ void SetVariance(T* out, const T* var, const int vnum, + const int num); +#endif + template class AnchorGeneratorOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt deleted file mode 100644 index c9db6148bc45d4..00000000000000 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ /dev/null @@ -1,76 +0,0 @@ -return() - -if(WITH_GRPC) - set(cc_generic_services "false") -else() - set(cc_generic_services "true") -endif() -configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) - -cc_library(async_sparse_param_update_recorder SRCS async_sparse_param_update_recorder.cc DEPS enforce simple_threadpool) -cc_test(async_sparse_param_update_recorder_test SRCS async_sparse_param_update_recorder_test.cc DEPS async_sparse_param_update_recorder) - -cc_library(heart_beat_monitor SRCS heart_beat_monitor.cc DEPS enforce simple_threadpool) -cc_library(large_scale_kv SRCS large_scale_kv.cc DEPS enforce simple_threadpool device_context) -cc_test(heart_beat_monitor_test SRCS heart_beat_monitor_test.cc DEPS heart_beat_monitor) - -# FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files -set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") -if(WITH_GRPC) - set(GRPC_DEPS grpc++_unsecure grpc_unsecure gpr zlib protobuf) - set(GRPC_SRCS grpc/grpc_client.cc grpc/grpc_server.cc grpc/grpc_serde.cc grpc/grpc_bytebuffer_stream.cc grpc/grpc_variable_response.cc) - grpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc - variable_response.cc - collective_client.cc collective_server.cc - ${GRPC_SRCS} - PROTO send_recv.proto - DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS} async_sparse_param_update_recorder heart_beat_monitor large_scale_kv) - - set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS}) - - cc_test(grpc_serde_test SRCS grpc/grpc_serde_test.cc - DEPS ${RPC_DEPS} scope profiler math_function) - -else() - set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) - set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc parameter_recv.cc communicator.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - - set(BRPC_DEPS brpc ssl crypto protobuf leveldb zlib) - - brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc - variable_response.cc - collective_client.cc collective_server.cc - ${BRPC_SRCS} - PROTO send_recv.proto - DEPS lod_tensor selected_rows memory scope ${BRPC_DEPS}) - - set(RPC_DEPS sendrecvop_rpc ${BRPC_DEPS}) - cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc - DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_read_op) -endif() - - -cc_test(rpc_server_test SRCS 
rpc_server_test.cc - DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op checkpoint_notify_op scale_op ) -cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) -cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory node) -cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) -cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) -cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv generator) -cc_test(communicator_test SRCS communicator_test.cc DEPS communicator) -if(WITH_GPU OR WITH_ROCM) - cc_test(collective_server_test SRCS collective_server_test.cc - DEPS sendrecvop_rpc executor ${RPC_DEPS} - selected_rows_functor scope math_function) -endif() -if(WITH_TESTING) - if(TEST rpc_server_test) - set_tests_properties(rpc_server_test PROPERTIES TIMEOUT 120) - endif() - if(TEST heart_beat_monitor_test) - set_tests_properties(heart_beat_monitor_test PROPERTIES TIMEOUT 120) - endif() -endif() diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h deleted file mode 100644 index 28a5f2ad6c7648..00000000000000 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class ConcurrentSet { - public: - ConcurrentSet() : pool_(new ::ThreadPool(1)) {} - ~ConcurrentSet() {} - - std::future Update(const std::vector& rows) { - auto task = [this, rows] { - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& id : rows) { - sstream << id << ", "; - } - sstream << "]"; - VLOG(3) << "update ids -> " << sstream.str(); - } - for (auto row : rows) { - set_.insert(row); - } - }; - return pool_->enqueue(std::move(task)); - } - - std::future GetAndClear(std::vector* result) { - auto task = [this, &result] { - result->clear(); - for (auto& id : set_) { - result->push_back(id); - } - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& id : *result) { - sstream << id << ", "; - } - sstream << "]"; - VLOG(3) << "result ids size: " << result->size() << " " - << sstream.str(); - } - set_.clear(); - }; - return pool_->enqueue(std::move(task)); - } - - private: - std::unordered_set set_; - std::unique_ptr<::ThreadPool> pool_{nullptr}; -}; - -class AsyncSparseParamUpdateRecorder { - using TrainerToRows = std::vector>; - - public: - AsyncSparseParamUpdateRecorder( - int trainer_num, - const std::unordered_map& grad_to_param) - : trainer_num_(trainer_num), grad_to_param_(grad_to_param) { - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& item : grad_to_param) { - sstream << item.first << ":" << item.second << ", "; - } - sstream << "]"; - VLOG(3) << "trainer_num: " << trainer_num - << " grad_to_param_: " << sstream.str(); - } - for (auto& iter : grad_to_param) { - param_to_grad_[iter.second] = iter.first; - auto& param_name = iter.second; - param_to_updated_rows_[param_name] = TrainerToRows(); - auto& trainer_to_rows = param_to_updated_rows_[param_name]; - for (auto i = 0; i < trainer_num; ++i) { - trainer_to_rows.emplace_back(new ConcurrentSet()); - } - } - } - - ~AsyncSparseParamUpdateRecorder() = default; - - void Update(const std::string& grad_name, - const std::vector& update_rows) { - VLOG(3) << "update grad: " << grad_name - << " row size: " << update_rows.size(); - auto& param_name = grad_to_param_.at(grad_name); - auto& trainer_to_rows = param_to_updated_rows_.at(param_name); - - std::vector> fs; - for (auto& set : trainer_to_rows) { - fs.push_back(set->Update(update_rows)); - } - for (auto& f : fs) { - f.wait(); - } - } - - void GetAndClear(const std::string& param_name, int trainer_id, - std::vector* result) { - VLOG(3) << "GetAndClear param: " << param_name - << " for trainer: " << trainer_id; - PADDLE_ENFORCE_LT( - trainer_id, trainer_num_, - platform::errors::InvalidArgument( - "The value of trainer_id: %s should less than trainer_num: %s.", - trainer_id, trainer_num_)); - param_to_updated_rows_.at(param_name)[trainer_id] - ->GetAndClear(result) - .wait(); - } - - bool HasParam(const std::string& param_name) { - return param_to_grad_.find(param_name) != param_to_grad_.end(); - } - - bool HasGrad(const std::string& grad_name) { - return grad_to_param_.find(grad_name) != grad_to_param_.end(); - } - - private: - const int trainer_num_; - std::unordered_map grad_to_param_; - std::unordered_map param_to_grad_; - std::unordered_map param_to_updated_rows_; - - // init recorder - public: - static void Init( - int trainer_num, - const 
std::unordered_map& grad_to_param) { - InitImpl(trainer_num, grad_to_param); - } - - static AsyncSparseParamUpdateRecorder* GetInstance() { - return recorder_.get(); - } - - private: - // Init is called by GetInstance. - static void InitImpl( - int trainer_num, - const std::unordered_map& grad_to_param) { - if (recorder_ == nullptr) { - recorder_.reset( - new AsyncSparseParamUpdateRecorder(trainer_num, grad_to_param)); - } - } - - static std::once_flag init_flag_; - static std::unique_ptr recorder_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc deleted file mode 100644 index 2d78559625c91f..00000000000000 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" -#include -#include "gtest/gtest.h" - -namespace paddle { -namespace operators { -namespace distributed { - -TEST(ConcurrentSet, All) { - ConcurrentSet concurrent_set; - std::vector in1 = {1, 2, 3, 4}; - std::vector in2 = {2, 3, 5, 6}; - - std::vector> futures; - futures.push_back(concurrent_set.Update(in1)); - futures.push_back(concurrent_set.Update(in2)); - - for (auto &f : futures) { - f.wait(); - } - - std::unordered_set in; - std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin())); - std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin())); - - std::vector ret; - concurrent_set.GetAndClear(&ret).wait(); - - std::unordered_set out; - std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin())); - - EXPECT_EQ(in, out); - - concurrent_set.GetAndClear(&ret).wait(); - EXPECT_EQ(ret.size(), 0UL); -} - -TEST(AsyncSparseParamUpdateRecorder, All) { - std::unordered_map grad_to_param; - grad_to_param["grad1"] = "param1"; - grad_to_param["grad2"] = "param2"; - - int trainer_num = 10; - - AsyncSparseParamUpdateRecorder recorder(trainer_num, grad_to_param); - std::vector in1 = {1, 2, 3, 4}; - std::vector in2 = {2, 3, 5, 6}; - - std::unordered_set in; - std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin())); - std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin())); - - recorder.Update("grad1", in1); - recorder.Update("grad1", in2); - - EXPECT_TRUE(recorder.HasParam("param1")); - EXPECT_TRUE(recorder.HasParam("param2")); - EXPECT_FALSE(recorder.HasParam("param3")); - - EXPECT_TRUE(recorder.HasGrad("grad1")); - EXPECT_TRUE(recorder.HasGrad("grad2")); - EXPECT_FALSE(recorder.HasGrad("grad3")); - - std::vector ret; - EXPECT_ANY_THROW(recorder.GetAndClear("param1", trainer_num, &ret)); - - for (int i = 0; i < trainer_num; ++i) { - std::vector ret; - std::unordered_set out; - - recorder.GetAndClear("param1", i, &ret); - 
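The ConcurrentSet removed above serializes every mutation of its row-id set through a one-thread ::ThreadPool and hands callers a std::future to wait on. A minimal standard-library-only sketch of the same idea follows; SingleWorker and RowIdSet are illustrative names, not types from the deleted code.

#include <condition_variable>
#include <cstdint>
#include <functional>
#include <future>
#include <mutex>
#include <queue>
#include <thread>
#include <unordered_set>
#include <vector>

// Single-worker executor: every task runs on one thread, so the set below
// needs no extra locking (same idea as ConcurrentSet's pool_ of size 1).
class SingleWorker {
 public:
  SingleWorker() : worker_([this] { Loop(); }) {}
  ~SingleWorker() {
    {
      std::lock_guard<std::mutex> g(mu_);
      stop_ = true;
    }
    cv_.notify_one();
    worker_.join();
  }
  std::future<void> Enqueue(std::function<void()> fn) {
    std::packaged_task<void()> task(std::move(fn));
    std::future<void> fut = task.get_future();
    {
      std::lock_guard<std::mutex> g(mu_);
      tasks_.push(std::move(task));
    }
    cv_.notify_one();
    return fut;
  }

 private:
  void Loop() {
    for (;;) {
      std::packaged_task<void()> task;
      {
        std::unique_lock<std::mutex> lk(mu_);
        cv_.wait(lk, [this] { return stop_ || !tasks_.empty(); });
        if (stop_ && tasks_.empty()) return;  // drain queue before exiting
        task = std::move(tasks_.front());
        tasks_.pop();
      }
      task();
    }
  }
  std::mutex mu_;
  std::condition_variable cv_;
  std::queue<std::packaged_task<void()>> tasks_;
  bool stop_ = false;
  std::thread worker_;  // declared last so it starts after the other members
};

// Future-returning row-id set, mirroring ConcurrentSet::Update / GetAndClear.
class RowIdSet {
 public:
  std::future<void> Update(std::vector<int64_t> rows) {
    return worker_.Enqueue(
        [this, rows] { set_.insert(rows.begin(), rows.end()); });
  }
  std::future<void> GetAndClear(std::vector<int64_t>* out) {
    return worker_.Enqueue([this, out] {
      out->assign(set_.begin(), set_.end());
      set_.clear();
    });
  }

 private:
  std::unordered_set<int64_t> set_;
  SingleWorker worker_;
};

int main() {
  RowIdSet ids;
  ids.Update({1, 2, 3}).wait();
  ids.Update({2, 4}).wait();
  std::vector<int64_t> rows;
  ids.GetAndClear(&rows).wait();
  return rows.size() == 4 ? 0 : 1;  // {1, 2, 3, 4} after deduplication
}

Because every task runs on the same worker thread, the set needs no mutex of its own; callers get ordering guarantees simply by waiting on the returned futures, which is how the deleted recorder fans an update out to one set per trainer.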
std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin())); - - EXPECT_EQ(in, out); - - recorder.GetAndClear("param1", i, &ret); - EXPECT_EQ(ret.size(), 0UL); - } -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc deleted file mode 100644 index b2a26089c86896..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc +++ /dev/null @@ -1,462 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/brpc/brpc_client.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { -namespace distributed { - -DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds"); -DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); - -BRPCClient::~BRPCClient() { Wait(); } - -void HandleSendResponse(brpc::Controller* cntl, sendrecv::VoidMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used by other now. 
- ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to send variable %s, error text is %s.", var_h->name(), - cntl->ErrorText())); - var_h->Finish(false); - cls->DecreaseReqCount(); - return; - } - var_h->Finish(true); - cls->DecreaseReqCount(); - - VLOG(4) << "HandleSendResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - VLOG(4) << "Finish HandleSendResponse"; -} - -VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - const std::string method = kSendRPC; - VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); - cntl->set_timeout_ms(time_out); - - auto* var = p_scope->FindVar(var_name_val); - sendrecv::VariableMessage request; - distributed::SerializeToIOBuf(var_name_val, var, *p_ctx, &request, - &cntl->request_attachment(), "", false, - trainer_id_); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - platform::RecordRPCEvent record_event(method); - - ch_ctx->stub->SendVariable(cntl, &request, response, done); - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - req_count_++; - - return var_h; -} -void HandleFetchBarrierResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used other now. - ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to get HandleFetchBarrierResponse %s, error text is %s.", - var_h->name(), cntl->ErrorText())); - var_h->Finish(false); - cls->DecreaseReqCount(); - return; - } - - var_h->Finish(true); - cls->DecreaseReqCount(); - - VLOG(4) << "HandleFetchBarrierResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - VLOG(4) << "Finish HandleFetchBarrierResponse"; -} -void HandleGetResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, VarHandlePtr var_h, - ChannelQueuePtr ch_ptr, ChannelContextPtr ch_ctx, - BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used other now. 
- ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to get variable %s, error text is %s.", var_h->name(), - cntl->ErrorText())); - cls->DecreaseReqCount(); - var_h->Finish(false); - return; - } - - VLOG(4) << "HandleGetResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - - framework::Variable* outvar = nullptr; - int trainer_id; - distributed::DeserializeFromIOBuf(*response, cntl->response_attachment(), - *var_h->ctx(), var_h->scope(), &outvar, - &trainer_id); - VLOG(4) << "Finish HandleGetResponse"; - cls->DecreaseReqCount(); - var_h->Finish(true); -} - -VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& method_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const std::string out_varname_val = out_var_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - const std::string method = kGetRPC; - VarHandlePtr var_h( - new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_out_varname(out_varname_val); - req.set_trainer_id(trainer_id_); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - platform::RecordRPCEvent record_event(method); - - if (method_name == kGetMonomerRPC) { - ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); - } else if (method_name == kGetNoBarrierRPC) { - ch_ctx->stub->GetVariableNoBarrier(cntl, &req, response, done); - } else { - ch_ctx->stub->GetVariable(cntl, &req, response, done); - } - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - - req_count_++; - - return var_h; -} - -VarHandlePtr BRPCClient::AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_var_name, int64_t time_out) { - std::string var_name_no_barrier = - string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); - - return _AsyncGetVar(ep, ctx, scope, var_name_no_barrier, out_var_name, - kGetNoBarrierRPC, time_out); -} - -VarHandlePtr BRPCClient::AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, var_name, kGetMonomerRPC, - time_out); -} - -VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, - const std::string& var_name, - int64_t time_out) { - return AsyncSendMessage(ep, kSendMonomerFetchBarrierRPC, var_name, time_out); -} - -VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC, - time_out); -} - 
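Each BRPCClient::Async* call above registers a completion callback, increments req_count_, and relies on DecreaseReqCount to wake Wait() once the last outstanding request finishes. A minimal sketch of that pending-request bookkeeping follows, with std::thread standing in for the brpc callback machinery; the class and method names are illustrative only.

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>

// Tracks in-flight asynchronous requests; Wait() blocks until all of them
// have completed, mirroring BRPCClient::req_count_ / sync_cond_.
class PendingRequests {
 public:
  // Launch some work; the completion path decrements the counter.
  void Launch(std::function<void()> work) {
    ++req_count_;
    threads_.emplace_back([this, work] {
      work();
      Done();
    });
  }

  void Wait() {
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [this] { return req_count_.load() == 0; });
  }

  ~PendingRequests() {
    for (auto& t : threads_) t.join();
  }

 private:
  void Done() {
    if (--req_count_ <= 0) {
      std::lock_guard<std::mutex> g(mu_);  // lock so the waiter cannot miss it
      cv_.notify_all();
    }
  }

  std::atomic<int64_t> req_count_{0};
  std::mutex mu_;
  std::condition_variable cv_;
  std::vector<std::thread> threads_;
};

int main() {
  PendingRequests reqs;
  for (int i = 0; i < 4; ++i) {
    reqs.Launch([] { std::this_thread::sleep_for(std::chrono::milliseconds(10)); });
  }
  reqs.Wait();  // returns once all four "RPCs" have finished
}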
-VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string in_var_name_val = in_var_name; - const std::string out_var_name_val = out_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - - const std::string method = kPrefetchRPC; - - VarHandlePtr var_h( - new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - auto* var = p_scope->FindVar(in_var_name_val); - sendrecv::VariableMessage req; - distributed::SerializeToIOBuf(in_var_name_val, var, *p_ctx, &req, - &cntl->request_attachment(), out_var_name_val, - false, 0, table_name_val); - - platform::RecordRPCEvent record_event(method); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - ch_ctx->stub->PrefetchVariable(cntl, &req, response, done); - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - - req_count_++; - return var_h; -} - -VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - return AsyncSendMessage(ep, kBatchBarrierRPC, BATCH_BARRIER_MESSAGE, - time_out); -} - -VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { - auto ch_ptr = GetChannel(ep); - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - sendrecv::VariableMessage req; - req.set_varname(FETCH_BARRIER_MESSAGE); - - const std::string method = kFetchBarrierRPC; - // var handle - VarHandlePtr var_h( - new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); - - platform::RecordRPCEvent record_event(method); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - ch_ctx->stub->GetVariable(cntl, &req, response, done); - - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - - return var_h; -} - -bool BRPCClient::Wait() { - VLOG(9) << "begin to brpcclient wait"; - { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return req_count_ == 0; }); - } - VLOG(9) << "end to brpcclient wait"; - return true; -} - -ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { - VLOG(4) << "begin to GetChannel:" << ep; - { - std::lock_guard guard(chan_mutex_); - auto it = channels_.find(ep); - if (it != channels_.end()) { - VLOG(4) << "end to GetChannel:" << ep; - return it->second; - } - } - - ChannelQueuePtr q(new framework::BlockingQueue()); - - brpc::ChannelOptions options; -#ifdef PADDLE_WITH_BRPC_RDMA - options.use_rdma = true; -#endif - options.protocol = "baidu_std"; - // don't use pooled type. the server can't afford that. 
- options.connection_type = "single"; - options.connect_timeout_ms = 1000; - options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/; - options.max_retry = FLAGS_max_retry; - - VLOG(1) << "create " << brpc_channel_num_per_server_ - << " brpc channels to pserver:" << ep; - - for (int i = 0; i < brpc_channel_num_per_server_; ++i) { - std::shared_ptr c(new ChannelContext()); - if (c->channel.Init(ep.c_str(), &options) != 0) { - PADDLE_THROW( - platform::errors::Unavailable("Failed to initialize channel.")); - return nullptr; - } - - c->stub.reset(new sendrecv::SendRecvService_Stub( - static_cast(&c->channel))); - q->Push(c); - } - - { - std::lock_guard guard(chan_mutex_); - channels_[ep] = q; - } - - VLOG(4) << "end to GetChannel:" << ep; - return q; -} - -VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, - int64_t time_out) { - return AsyncSendMessage(ep, kSendCompleteRPC, COMPLETE_MESSAGE, time_out); -} - -void BRPCClient::SendComplete() { - for (auto& kv : channels_) { - AsyncSendComplete(kv.first); - } -} - -VarHandlePtr BRPCClient::AsyncSendVarMessage( - const std::string& ep, const std::string& method_name, - const sendrecv::VariableMessage& req, int64_t time_out) { - auto ch_ptr = GetChannel(ep); - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); - cntl->set_timeout_ms(time_out); - - platform::RecordRPCEvent record_event(method_name); - - VarHandlePtr var_h( - new VarHandle(ep, method_name, req.varname(), nullptr, nullptr)); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - if (method_name == kCheckPointNotifyRPC) { - ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); - } else if (method_name == kSendMonomerFetchBarrierRPC) { - ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); - } else { - ch_ctx->stub->SendVariable(cntl, &req, response, done); - } - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - - return var_h; -} - -VarHandlePtr BRPCClient::AsyncSendMessage(const std::string& ep, - const std::string& method_name, - const std::string& message, - int64_t time_out) { - sendrecv::VariableMessage req; - req.set_varname(message); - - return AsyncSendVarMessage(ep, method_name, req, time_out); -} - -VarHandlePtr BRPCClient::AsyncCheckpointNotify(const std::string& ep, - const std::string& dirname, - const std::string& varname, - const int mode, - int64_t time_out) { - sendrecv::VariableMessage req; - req.set_varname(varname); - req.set_out_varname(dirname); - - return AsyncSendVarMessage(ep, "CheckPointNotifyRPC", req, time_out); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h deleted file mode 100644 index 91f94b4c9d5a30..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.h +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
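GetChannel above keeps a small fixed pool of channel contexts per endpoint in a blocking queue: a sender Pops one before issuing an RPC and the response handler Pushes it back. A standard-library-only sketch of that checkout/return pool follows; ChannelCtx is a placeholder, not the brpc type.

#include <condition_variable>
#include <deque>
#include <memory>
#include <mutex>
#include <string>

struct ChannelCtx {  // placeholder for the brpc channel + stub pair
  std::string endpoint;
};

// Blocking pool: Pop() hands out a context, Push() returns it, and callers
// block while every context is in use (the role of BlockingQueue above).
class ChannelPool {
 public:
  ChannelPool(const std::string& ep, int n) {
    for (int i = 0; i < n; ++i) {
      pool_.push_back(std::make_shared<ChannelCtx>(ChannelCtx{ep}));
    }
  }

  std::shared_ptr<ChannelCtx> Pop() {
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [this] { return !pool_.empty(); });
    auto ctx = pool_.front();
    pool_.pop_front();
    return ctx;
  }

  void Push(std::shared_ptr<ChannelCtx> ctx) {
    {
      std::lock_guard<std::mutex> g(mu_);
      pool_.push_back(std::move(ctx));
    }
    cv_.notify_one();
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::deque<std::shared_ptr<ChannelCtx>> pool_;
};

int main() {
  ChannelPool pool("127.0.0.1:8000", 4);  // 4 channels per pserver endpoint
  auto ctx = pool.Pop();                  // check out before sending an RPC
  // ... issue the request on ctx ...
  pool.Push(ctx);                         // return it so other senders can reuse it
}

Bounding the pool at a fixed size throttles each sender per endpoint, which mirrors the comment above about avoiding brpc's pooled connection type.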
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include // NOLINT -#include -#include -#include -#include -#include -#include // NOLINT -#include -#include -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN - -namespace paddle { -namespace operators { -namespace distributed { - -struct ChannelContext { - brpc::Channel channel; - std::shared_ptr stub; -}; - -typedef std::shared_ptr ChannelContextPtr; -typedef std::shared_ptr> - ChannelQueuePtr; - -class BRPCClient : public RPCClient { - public: - BRPCClient() {} - virtual ~BRPCClient(); - - VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVarNoBarrier(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline); - - VarHandlePtr AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendFetchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) override; - - bool Wait() override; - - void SendComplete() override; - - private: - VarHandlePtr _AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_var_name, const 
std::string& method_name, - const std::string& table_name, int64_t time_out = FLAGS_rpc_deadline); - - void Proceed(); - ChannelQueuePtr GetChannel(const std::string& ep); - - VarHandlePtr AsyncSendComplete(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline); - - VarHandlePtr AsyncSendMessage(const std::string& ep, - const std::string& method_name, - const std::string& message, int64_t time_out); - - VarHandlePtr AsyncSendVarMessage(const std::string& ep, - const std::string& method_name, - const sendrecv::VariableMessage& req, - int64_t time_out); - - friend void HandleSendResponse(brpc::Controller* cntl, - sendrecv::VoidMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls); - - friend void HandleGetResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls); - - friend void HandleFetchBarrierResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, - ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, - BRPCClient* cls); - void DecreaseReqCount() { - if (--req_count_ <= 0) { - sync_cond_.notify_all(); - } - } - - private: - std::unordered_map channels_; - - // mutex for Wait client sync - std::mutex sync_mutex_; - std::condition_variable sync_cond_; - std::atomic req_count_{0}; - - static constexpr int brpc_channel_num_per_server_ = 4; - - // mutex for GetChannel thread safety - std::mutex chan_mutex_; - DISABLE_COPY_AND_ASSIGN(BRPCClient); -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc deleted file mode 100644 index 94f0b9919ace83..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifdef PADDLE_WITH_BRPC_RDMA - -#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" -#include "brpc/channel.h" -#include "brpc/rdma/rdma_helper.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -RdmaMemPool& RdmaMemPool::Instance() { - static RdmaMemPool* g_rdma_mem_pool = new RdmaMemPool(); - return *g_rdma_mem_pool; -} - -void* RdmaMemPool::Find(const std::string& varname, int64_t size) { - pthread_rwlock_rdlock(&access_); - auto it = pool_.find(varname); - if (it == pool_.end()) { - pthread_rwlock_unlock(&access_); - return nullptr; - } - - auto info = it->second; - if (info.data_size != size) { - pthread_rwlock_unlock(&access_); - PADDLE_THROW(platform::errors::InvalidArgument( - "var:%s size:%ld != %ld", varname, size, info.data_size)); - return nullptr; - } - - pthread_rwlock_unlock(&access_); - return info.data; -} - -void RdmaMemPool::Register(const std::string& varname, void* data, - int64_t data_size) { - void* old = Find(varname, data_size); - if (old != nullptr) { - PADDLE_ENFORCE_EQ( - data, old, platform::errors::InvalidArgument("var:%s data:%ld != %ld", - varname, data, old)); - VLOG(7) << "Find on rdma:" << varname << " data:" << data - << " data_size:" << data_size; - return; - } - - VarInfo info; - info.data = data; - info.data_size = data_size; - - pthread_rwlock_wrlock(&access_); - pool_[varname] = info; - pthread_rwlock_unlock(&access_); - - if (brpc::rdma::RegisterMemoryForRdma(data, data_size)) { - PADDLE_THROW(platform::errors::Unavailable( - "Register memory for RDMA failed. Register %s data: %s data size %d " - "error.", - varname, data, data_size)); - } - - VLOG(4) << "register on rdma:" << varname << " data:" << data - << " data_size:" << data_size; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle - -#endif diff --git a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h deleted file mode 100644 index 156a93ec578471..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#ifdef PADDLE_WITH_BRPC_RDMA - -#include // NOLINT -#include -#include - -namespace paddle { -namespace operators { -namespace distributed { - -/* - * This class is used to avoid duplicated registion of brpc::rdma. 
- */ -class RdmaMemPool { - public: - static RdmaMemPool& Instance(); - RdmaMemPool() : access_(PTHREAD_RWLOCK_INITIALIZER) {} - - virtual ~RdmaMemPool() { pthread_rwlock_destroy(&access_); } - - void Register(const std::string& varname, void* data, int64_t size); - void* Find(const std::string& varname, int64_t size); - - private: - struct VarInfo { - void* data; - int64_t data_size; - - VarInfo() : data(nullptr), data_size(0) {} - }; - - private: - std::unordered_map pool_; - pthread_rwlock_t access_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle - -#endif diff --git a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc deleted file mode 100644 index 411c0f36debd3b..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_NCCL -#include -#endif -#ifdef PADDLE_WITH_RCCL -#include -#endif -#include -#include -#include -#include // NOLINT - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class IOBufWriter { - public: - static void Append(const std::string& varname, butil::IOBuf* iobuf, int k, - const char* v, int64_t vlen) { - if (vlen >= std::numeric_limits::max() || vlen < 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Variable lenght is invalid. 
Variable name is %s, length is %d.", - varname, vlen)); - } - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - iobuf->append(v, vlen); - } - - static void AppendTCPZeroCopy(butil::IOBuf* iobuf, int k, const char* v, - int64_t vlen, bool in_cuda_pinned, - void (*destroy)(void*), void* user_data) { - VLOG(7) << "AppendTCPZeroCopy " - << " k:" << k - << " data:" << static_cast(const_cast(v)) - << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - - // FIXME(gongwb): use append_zerocopy - /* - if (in_cuda_pinned) { - iobuf->append_zerocopy(v, vlen, IOBufWriter::FreeMemory); - } else { - iobuf->append_zerocopy(v, vlen, nullptr); - } - */ - iobuf->append(v, vlen); - destroy(user_data); - } - -#ifdef PADDLE_WITH_BRPC_RDMA - static void AppendRdmaZeroCopy(const std::string varname, butil::IOBuf* iobuf, - int k, const char* v, int64_t vlen, - bool in_cuda_pinned, void (*destroy)(void*), - void* user_data) { - VLOG(7) << "AppendRdmaZeroCopy varname:" << varname << " k:" << k - << " data:" << static_cast(const_cast(v)) - << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - - RdmaMemPool::Instance().Register( - varname, static_cast(const_cast(v)), vlen); - - // FIXME(gongwb): use append_zerocopy - // iobuf->append_zerocopy(v, vlen, nullptr); - iobuf->append(v, vlen); - destroy(user_data); - return; - } -#endif - - static void AppendZeroCopy(const std::string varname, butil::IOBuf* iobuf, - int k, const char* v, int64_t vlen, - bool in_cuda_pinned, void (*destroy)(void*), - void* user_data) { - if (vlen >= std::numeric_limits::max() || vlen < 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Variable lenght is invalid. Variable name is %s, length is %d.", - varname, vlen)); - } - -#ifdef PADDLE_WITH_BRPC_RDMA - IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned, - destroy, user_data); -#else - IOBufWriter::AppendTCPZeroCopy(iobuf, k, v, vlen, in_cuda_pinned, destroy, - user_data); -#endif - } -}; - -void SerializeToIOBuf(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, VarMsg* request, - butil::IOBuf* iobuf, const std::string& out_varname, - bool var_is_not_stable, int trainer_id, - const std::string& table_name) { - std::unique_ptr payload; - - request->set_varname(name); - request->set_trainer_id(trainer_id); - // Note: normally the profiler is enabled in 1 trainer, hence only - // 1 trainer returns true for ShouldSendProfileState(). It tells PS - // servers the trainer's profiling state so that PS can follow the - // trainer. 
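IOBufWriter above frames each payload as a 4-byte field key, an 8-byte length, and then the raw bytes, with the zero-copy variants only changing how those bytes are attached to the IOBuf. A minimal sketch of that framing into a plain byte buffer follows, assuming native byte order as the original does; AppendField is an illustrative name.

#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Append one "field": 4-byte key, 8-byte length, then the payload bytes,
// matching the layout IOBufWriter::Append writes into the IOBuf.
void AppendField(std::vector<char>* buf, int32_t key, const char* data,
                 int64_t len) {
  const char* k = reinterpret_cast<const char*>(&key);
  const char* l = reinterpret_cast<const char*>(&len);
  buf->insert(buf->end(), k, k + sizeof(key));
  buf->insert(buf->end(), l, l + sizeof(len));
  buf->insert(buf->end(), data, data + len);
}

int main() {
  std::vector<char> buf;
  std::string payload = "tensor-bytes";
  AppendField(&buf, /*key=*/7, payload.data(),
              static_cast<int64_t>(payload.size()));

  // Reader side: peel off the key and length, then the payload.
  int32_t key = 0;
  int64_t len = 0;
  std::memcpy(&key, buf.data(), sizeof(key));
  std::memcpy(&len, buf.data() + sizeof(key), sizeof(len));
  std::cout << "key=" << key << " len=" << len << " payload="
            << std::string(buf.data() + sizeof(key) + sizeof(len), len) << "\n";
}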
- if (platform::ShouldSendProfileState()) { - if (platform::IsProfileEnabled()) { - request->set_profile(platform::kEnableProfiler); - } else { - request->set_profile(platform::kDisableProfiler); - } - } - if (!out_varname.empty()) { - request->set_out_varname(out_varname); - } - if (!table_name.empty()) { - request->set_table_name(table_name); - } - if (var->IsType()) { - request->set_type(::sendrecv::LOD_TENSOR); - payload.reset(new TensorPayload(GetTensorPayload(var, ctx, request))); - } else if (var->IsType()) { - request->set_type(::sendrecv::SELECTED_ROWS); - payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request))); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - } else if (var->IsType()) { - request->set_type(::sendrecv::NCCL_ID); - const ncclUniqueId& uid = var->Get(); - // TODO(gongwb): use append_zero to avoid data copy. - IOBufWriter::Append(name, iobuf, - sendrecv::VariableMessage::kSerializedFieldNumber, - uid.internal, NCCL_UNIQUE_ID_BYTES); - return; -#endif - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Serialize does not support type: %s", typeid(var->Type()).name())); - } - - PADDLE_ENFORCE_NOT_NULL( - payload, - platform::errors::InvalidArgument( - "Not support type: %s, need to be LOD_TENSOR or SELECTED_ROWS.", - var->Type())); - - // FIXME(gongwb): it seems that can use zero copy. - if (var_is_not_stable) { - IOBufWriter::Append( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size()); - } else { - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - IOBufWriter::AppendZeroCopy( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size(), - true, SerializeDestroyCallback, static_cast(payload.get())); - payload.release(); -#endif - } else { - IOBufWriter::AppendZeroCopy( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size(), - false, SerializeDestroyCallback, static_cast(payload.get())); - payload.release(); - } - } - - if (var->IsType()) { - auto* slr = var->GetMutable(); - PADDLE_ENFORCE_EQ(VectorElemName(slr->rows()), typeid(int64_t).name(), - platform::errors::InvalidArgument( - "Got wrong type: %s, expect type: int64_t", - VectorElemName(slr->rows()))); - size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); - - IOBufWriter::Append(name, iobuf, - ::sendrecv::VariableMessage::kRowsFieldNumber, - reinterpret_cast(slr->rows().data()), - static_cast(rows_memory_size)); - } -} - -void DeserializeFromIOBuf(const ::sendrecv::VariableMessage& meta, - const butil::IOBuf& iobuf, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id) { - operators::distributed::BRPCVariableResponse resp(scope, &ctx); - PADDLE_ENFORCE_EQ( - resp.Parse(iobuf, meta), 0, - platform::errors::InvalidArgument("parse iobuf to tensor error!")); - *var = resp.GetVar(); - *trainer_id = resp.GetTrainerId(); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h deleted file mode 100644 index a5bdc331eb29c7..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" - -namespace paddle { -namespace operators { -namespace distributed { - -void SerializeToIOBuf(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, VarMsg* request, - butil::IOBuf* iobuf, const std::string& out_varname, - bool var_is_not_stable, const int trainer_id = 0, - const std::string& table_name = std::string()); - -void DeserializeFromIOBuf(const VarMsg& meta, const butil::IOBuf& iobuf, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc b/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc deleted file mode 100644 index bcf20ad076b11f..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc +++ /dev/null @@ -1,175 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include // NOLINT - -#include "brpc/channel.h" -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/variable_response.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/printf.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace operators = paddle::operators; -namespace math = paddle::operators::math; -namespace memory = paddle::memory; - -void RunSerdeTestSelectedRows(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - butil::IOBuf iobuf; - sendrecv::VariableMessage msg; - int tensor_numel = 564 * 128; - - // serialize var to IOBuf - { - framework::Variable var; - auto* slr = var.GetMutable(); - slr->set_height(1000); - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({564, 128})); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 564; ++i) rows->push_back(i); - - operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, - "", false); - } - - // desrialize - { - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::BRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(iobuf, msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto* slr2 = var2->GetMutable(); - auto* tensor2 = slr2->mutable_value(); - auto* rows2 = slr2->mutable_rows(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(*tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2->data()); - } - const int64_t* rows_data2 = rows2->data(); - - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); - } - for (size_t i = 0; i < rows2->size(); ++i) { - EXPECT_EQ(rows_data2[i], static_cast(i)); - } - EXPECT_EQ(slr2->height(), 1000); - } -} - -void RunTestLodTensor(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - // serialize var to ByteBuffer - butil::IOBuf iobuf; - sendrecv::VariableMessage msg; - int tensor_numel = 512 * 8 * 4 * 2; - { - framework::Variable var; - auto* tensor = var.GetMutable(); - tensor->Resize(framework::make_ddim({512, 8, 4, 2})); - framework::LoD lod; - lod.push_back(framework::Vector({1, 3, 8})); - tensor->set_lod(lod); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 31.9); - - operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, - "", false); - } - - // check sendrecv::VariableMessage meta data - { - EXPECT_EQ(msg.varname(), "myvar"); - EXPECT_EQ(msg.type(), 0); - EXPECT_EQ(msg.dims()[0], 512); - EXPECT_EQ(msg.dims()[1], 8); - EXPECT_EQ(msg.dims()[2], 4); - EXPECT_EQ(msg.dims()[3], 2); - EXPECT_EQ(msg.lod_level(), 1); - EXPECT_EQ(msg.lod(0).lod_data(0), 1); - 
EXPECT_EQ(msg.lod(0).lod_data(1), 3); - EXPECT_EQ(msg.lod(0).lod_data(2), 8); - } - - // deserialize - { - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::BRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(iobuf, msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto tensor2 = var2->Get(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2.data()); - } - - for (int i = 0; i < tensor_numel; ++i) - EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); - } -} - -TEST(LodTensor, Run) { - platform::CPUPlace place; - RunTestLodTensor(place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu(0); - RunTestLodTensor(gpu); -#endif -} - -TEST(SelectedRows, Run) { - platform::CPUPlace place; - RunSerdeTestSelectedRows(place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu; - RunSerdeTestSelectedRows(gpu); -#endif -} diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.cc b/paddle/fluid/operators/distributed/brpc/brpc_server.cc deleted file mode 100644 index 5ca26f006bf20e..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_server.cc +++ /dev/null @@ -1,417 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/distributed/brpc/brpc_server.h" -#include -#include -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace sendrecv { - -namespace distributed = paddle::operators::distributed; - -typedef std::unordered_map - HandlerMap; - -class BRPCServiceImpl : public SendRecvService { - public: - explicit BRPCServiceImpl(const HandlerMap& rpc_call_map, - distributed::RPCServer* rpc_server) - : rpc_server_(rpc_server) { - VLOG(3) << "BRPCServiceImpl size: " << rpc_call_map.size(); - auto it = rpc_call_map.find(distributed::kRequestSend); - if (it != rpc_call_map.end()) { - request_send_h_ = it->second; - send_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestSend))); - } - - it = rpc_call_map.find(distributed::kRequestGet); - if (it != rpc_call_map.end()) { - request_get_h_ = it->second; - get_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestGet))); - } - - it = rpc_call_map.find(distributed::kRequestGetNoBarrier); - if (it != rpc_call_map.end()) { - request_getnobarrier_h_ = it->second; - getnobarrier_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestGetNoBarrier))); - } - - it = rpc_call_map.find(distributed::kRequestPrefetch); - if (it != rpc_call_map.end()) { - request_prefetch_h_ = it->second; - prefetch_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); - } - - it = rpc_call_map.find(distributed::kRequestCheckpoint); - if (it != rpc_call_map.end()) { - request_checkpoint_h_ = it->second; - checkpoint_notify_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); - } - - it = rpc_call_map.find(distributed::kRequestGetMonomerVariable); - if (it != rpc_call_map.end()) { - request_get_monomer_handler_h_ = it->second; - } - - it = rpc_call_map.find(distributed::kRequestGetMonomerBarrier); - if (it != rpc_call_map.end()) { - request_get_monomer_barrier_handler_h_ = it->second; - } - } - - virtual ~BRPCServiceImpl() {} - void SendVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - send_threads_->Run( - [=] { _SendVariable(cntl_butil, request, response, done); }); - } - - void _SendVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_send_h_, platform::errors::PreconditionNotMet( - "RequestSend handler should be registed first!")); - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - VLOG(3) << "RequestSend var_name:" << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - distributed::BRPCVariableResponse resp(request_send_h_->scope(), - request_send_h_->dev_ctx(), - request_send_h_->distributed_mode()); - PADDLE_ENFORCE_EQ( - resp.Parse(cntl->request_attachment(), *request), 0, - platform::errors::InvalidArgument("parse iobuf to tensor error!")); - - auto scope = resp.GetMutableLocalScope(); - auto invar = resp.GetVar(); - int 
trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = nullptr; - - request_send_h_->Handle(varname, scope, invar, &outvar, trainer_id); - } - - void GetVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VariableMessage* response, - google::protobuf::Closure* done) override { - get_threads_->Run( - [=] { _GetVariable(cntl_butil, request, response, done); }); - } - - void GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - getnobarrier_threads_->Run( - [=] { _GetVariableNoBarrier(cntl_butil, request, response, done); }); - } - - void _GetVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_get_h_, platform::errors::PreconditionNotMet( - "RequestGet handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - std::string out_varname = request->out_varname(); - VLOG(3) << "RequestGet varname:" << varname - << ", out_varname:" << out_varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - auto scope = request_get_h_->scope(); - paddle::framework::Variable* invar = nullptr; - int trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = nullptr; - - request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - distributed::SerializeToIOBuf(out_varname, outvar, - *request_get_h_->dev_ctx(), response, - &cntl->response_attachment(), "", false); - } - } - - void _GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_getnobarrier_h_, - platform::errors::PreconditionNotMet( - "RequestGetNoBarrier handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - std::string out_varname = request->out_varname(); - int trainer_id = request->trainer_id(); - - VLOG(3) << "RequestGetNoBarrier varname:" << varname - << ", out_varname:" << out_varname << ", trainer_id:" << trainer_id - << ", from:" << cntl->remote_side(); - - auto scope = request_getnobarrier_h_->scope(); - paddle::framework::Variable* invar = nullptr; - paddle::framework::Variable* outvar = nullptr; - - request_getnobarrier_h_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - distributed::SerializeToIOBuf( - out_varname, outvar, *request_getnobarrier_h_->dev_ctx(), response, - &cntl->response_attachment(), "", false); - } - } - - void PrefetchVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - prefetch_threads_->Run( - [=] { _PrefetchVariable(cntl_butil, request, response, done); }); - } - - void _PrefetchVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL(request_prefetch_h_, - platform::errors::PreconditionNotMet( - "kRequestPrefetch handler should be registed first!"); - - 
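The service implementation above keeps the brpc I/O thread free: each request type is bounced onto its own thread pool, and a brpc::ClosureGuard makes the response completion run when the handler scope ends. A minimal sketch of that dispatch-plus-scope-exit-completion shape follows, with std::async standing in for the per-request thread pools; the ClosureGuard here is a simplified stand-in, not the brpc class.

#include <functional>
#include <future>
#include <iostream>
#include <string>

// RAII stand-in for brpc::ClosureGuard: guarantees the completion callback
// runs exactly once when the handler scope ends.
class ClosureGuard {
 public:
  explicit ClosureGuard(std::function<void()> done) : done_(std::move(done)) {}
  ~ClosureGuard() {
    if (done_) done_();
  }
  ClosureGuard(const ClosureGuard&) = delete;
  ClosureGuard& operator=(const ClosureGuard&) = delete;

 private:
  std::function<void()> done_;
};

// Handler body, run off the I/O thread (std::async plays the thread pool).
void HandleSend(const std::string& varname, std::function<void()> done) {
  ClosureGuard guard(std::move(done));       // completion fires on every path
  if (varname.empty()) {
    std::cout << "reject: empty varname\n";  // early return still completes
    return;
  }
  std::cout << "stored " << varname << "\n";
}

int main() {
  auto fut = std::async(std::launch::async, HandleSend, std::string("param1"),
                        [] { std::cout << "done->Run()\n"; });
  fut.wait();
}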
brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - // prefetch process... - std::string in_var_name = request->varname(); - std::string out_var_name = request->out_varname(); - VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name - << ", out_var_name: " << out_var_name - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - distributed::BRPCVariableResponse resp( - request_prefetch_h_->scope(), request_prefetch_h_->dev_ctx(), true); - - PADDLE_ENFORCE_EQ(resp.Parse(cntl->request_attachment(), *request), 0, - platform::errors::InvalidArgument( - "parse iobuf to tensor error!")); - - auto scope = resp.GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - std::string table_name = request->table_name(); - int trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = scope->Var(out_var_name); - - request_prefetch_h_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - - distributed::SerializeToIOBuf(out_var_name, outvar, - *request_prefetch_h_->dev_ctx(), response, - &cntl->response_attachment(), "", true); - } - - void CheckpointNotify(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - checkpoint_notify_threads_->Run( - [=] { _CheckpointNotify(cntl_butil, request, response, done); }); - } - - void _CheckpointNotify(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_checkpoint_h_, - platform::errors::PreconditionNotMet( - "kRequestCheckpointNotify handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - distributed::BRPCVariableResponse resp(request_checkpoint_h_->scope(), - request_checkpoint_h_->dev_ctx()); - - auto scope = resp.GetMutableLocalScope(); - - std::string checkpoint_notify = request->varname(); - std::string checkpoint_dir = request->out_varname(); - int trainer_id = request->trainer_id(); - - VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify - << ", dir: " << checkpoint_dir - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - request_checkpoint_h_->Handle(checkpoint_notify, scope, nullptr, nullptr, - trainer_id, checkpoint_dir); - } - - void GetMonomerVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - PADDLE_ENFORCE_NOT_NULL( - request_get_monomer_handler_h_, - platform::errors::PreconditionNotMet( - "kRequestGetMonomerVariable handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - // proc request. 
- std::string varname = request->varname(); - VLOG(3) << "GetMonomerVariable " << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - rpc_server_->WaitVarCond(varname); - distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); - - auto scope = h.scope_; - auto invar = scope->FindVar(varname); - paddle::framework::Variable* outvar = nullptr; - - request_get_monomer_handler_h_->Handle(varname, scope, invar, &outvar, - request->trainer_id()); - - if (outvar) { - distributed::SerializeToIOBuf(varname, outvar, *h.dev_ctx_, response, - &cntl->response_attachment(), "", false); - } - } - - void GetMonomerBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - PADDLE_ENFORCE_NOT_NULL( - request_get_monomer_barrier_handler_h_, - platform::errors::PreconditionNotMet( - "RequestGetMonomerBarrier handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - VLOG(3) << "RequestGetMonomerBarrier var_name:" << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - rpc_server_->WaitVarCond(varname); - distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); - - paddle::framework::Scope* scope = nullptr; - paddle::framework::Variable* invar = nullptr; - paddle::framework::Variable* outvar = nullptr; - - request_get_monomer_barrier_handler_h_->Handle( - varname, scope, invar, &outvar, request->trainer_id()); - } - - private: - distributed::RequestHandler* request_send_h_{nullptr}; - distributed::RequestHandler* request_get_h_{nullptr}; - distributed::RequestHandler* request_getnobarrier_h_{nullptr}; - distributed::RequestHandler* request_prefetch_h_{nullptr}; - distributed::RequestHandler* request_checkpoint_h_{nullptr}; - distributed::RequestHandler* request_get_monomer_handler_h_{nullptr}; - distributed::RequestHandler* request_get_monomer_barrier_handler_h_{nullptr}; - - distributed::RPCServer* rpc_server_{nullptr}; - - // FIXME(gongwb): brpc should support process one rpc use one threadpool. - std::unique_ptr send_threads_; - std::unique_ptr get_threads_; - std::unique_ptr getnobarrier_threads_; - std::unique_ptr prefetch_threads_; - std::unique_ptr checkpoint_notify_threads_; -}; -} // namespace sendrecv - -namespace paddle { -namespace operators { -namespace distributed { - -void AsyncBRPCServer::StartServer() { - // Instance of your service. - sendrecv::BRPCServiceImpl service_impl(rpc_call_map_, this); - - // Add the service into server. Notice the second parameter, because the - // service is put on stack, we don't want server to delete it, otherwise - // use brpc::SERVER_OWNS_SERVICE. 
- if (server_.AddService(&service_impl, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Failed to add service into BRPC server.")); - return; - } - - brpc::ServerOptions options; -#ifdef PADDLE_WITH_BRPC_RDMA - options.use_rdma = true; -#endif - options.idle_timeout_sec = idle_timeout_s_; - options.max_concurrency = max_concurrency_; - if (server_.Start(bind_address_.c_str(), &options) != 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Failed to start EchoServer %s.", bind_address_)); - return; - } - - butil::EndPoint ep = server_.listen_address(); - selected_port_ = ep.port; - - { - std::lock_guard lock(this->mutex_ready_); - ready_ = 1; - } - condition_ready_.notify_all(); - - server_.Join(); -} - -void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); } - -void AsyncBRPCServer::WaitServerReady() { - VLOG(3) << "AsyncGRPCServer is wait server ready"; - std::unique_lock lock(this->mutex_ready_); - condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(3) << "AsyncGRPCServer WaitSeverReady"; -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.h b/paddle/fluid/operators/distributed/brpc/brpc_server.h deleted file mode 100644 index 78bbe5adc0813d..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_server.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // NOLINT -#include // NOLINT -#include - -#include "brpc/server.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class AsyncBRPCServer final : public RPCServer { - public: - explicit AsyncBRPCServer(const std::string& address, int client_num) - : RPCServer(address, client_num), ready_(0) {} - - virtual ~AsyncBRPCServer() {} - void StartServer() override; - void WaitServerReady() override; - - private: - void ShutDownImpl() override; - - brpc::Server server_; - - static constexpr int idle_timeout_s_ = -1; - static constexpr int max_concurrency_ = 0; - - std::mutex mutex_ready_; - std::condition_variable condition_ready_; - int ready_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc deleted file mode 100644 index 49521e8a77057b..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - -namespace paddle { -namespace operators { -namespace distributed { - -namespace pb = ::google::protobuf; -using vr = ::sendrecv::VariableMessage; - -int BRPCVariableResponse::Parse(Source* source) { - pb::io::ZeroCopyInputStream* input_stream = source->contents(); - pb::io::CodedInputStream input(input_stream); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); - - while (1) { - unsigned int tag = 0; - if (!input.ReadLittleEndian32(&tag)) { - break; - } - - uint64_t num_bytes = 0; - if (!input.ReadLittleEndian64(&num_bytes)) { - break; - } - - int field = static_cast(tag); - int ret = field == 0 ? -1 : field; - switch (field) { - case vr::kSerializedFieldNumber: { - if (!ProcSerializedField(field, &input, num_bytes)) { - return ret; - } - break; - } - case vr::kRowsFieldNumber: { - PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || - meta_.type() == sendrecv::LOD_TENSOR) && - meta_.varname() != "", - platform::errors::PreconditionNotMet( - "meta info should be got first!")); - - if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { - return ret; - } - break; - } - default: { - PADDLE_THROW(platform::errors::Unavailable( - "not surpported %u fieldnumber", field)); - return ret; - } - } - } - - return 0; -} -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h deleted file mode 100644 index 6282f08a725367..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" - -#include "paddle/fluid/operators/distributed/distributed_pb.h" - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class BRPCSourceWrapper : public Source { - public: - explicit BRPCSourceWrapper(const butil::IOBuf& iobuf) : source_(iobuf) {} - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - return &source_; - } - - private: - butil::IOBufAsZeroCopyInputStream source_; -}; - -class BRPCVariableResponse : public VariableResponse { - public: - BRPCVariableResponse(const framework::Scope* scope, - const platform::DeviceContext* dev_ctx, - bool create_scope = false) - : VariableResponse(scope, dev_ctx, create_scope) {} - - virtual ~BRPCVariableResponse() {} - - // parse attachment from iobuf - int Parse(Source* source) override; - int Parse(const butil::IOBuf& iobuf, const sendrecv::VariableMessage& meta) { - BRPCSourceWrapper wrapper(iobuf); - return VariableResponse::Parse(&wrapper, meta); - } -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_client.cc b/paddle/fluid/operators/distributed/collective_client.cc deleted file mode 100644 index fcd3e6abead510..00000000000000 --- a/paddle/fluid/operators/distributed/collective_client.cc +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/distributed/collective_client.h" -#include -#include "gflags/gflags.h" - -DECLARE_int32(rpc_deadline); - -namespace paddle { -namespace operators { -namespace distributed { -std::once_flag CollectiveClient::init_flag_; -std::unique_ptr CollectiveClient::client_(nullptr); - -bool CollectiveClient::Gather(const std::vector& remote_vars, - std::vector* dst, - const platform::DeviceContext& ctx, - framework::Scope* scope, int64_t time_out) { - for (auto r : remote_vars) { - VLOG(50) << "begin gather from ep:" << r.String(); - scope->Var(r.var_name_)->GetMutable(); - VarHandlePtr ptr = rpc_client_->AsyncGetMonomerVariable( - r.ep_, ctx, *scope, r.var_name_, time_out); - } - - rpc_client_->Wait(); - - for (auto r : remote_vars) { - auto select_rows = - scope->FindVar(r.var_name_)->GetMutable(); - dst->push_back(select_rows); - - VLOG(4) << "gather from ep:" << r.String() - << ", select_rows:" << GetSelectedRowsInfo(*select_rows); - - rpc_client_->AsyncGetMonomerBarrier(r.ep_, r.var_name_); - } - - rpc_client_->Wait(); - return true; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_client.h b/paddle/fluid/operators/distributed/collective_client.h deleted file mode 100644 index e7d8bb8df98347..00000000000000 --- a/paddle/fluid/operators/distributed/collective_client.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include // NOLINT -#include -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -class SelectedRows; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -DECLARE_int32(rpc_deadline); - -namespace paddle { -namespace operators { -namespace distributed { - -inline std::string GetSelectedRowsInfo(const framework::SelectedRows& slr) { - std::stringstream ss; - ss << ", height:" << slr.height() << ", rows:["; - for (unsigned int i = 0; i < slr.rows().size(); i++) { - if (i != slr.rows().size() - 1) { - ss << slr.rows()[i] << ","; - } else { - ss << slr.rows()[i]; - } - } - ss << "], dims:" << slr.value().dims(); - return ss.str(); -} - -struct RemoteVar { - std::string ep_; - std::string var_name_; - int trainer_id_{0}; - - std::string String() { - std::stringstream ss; - ss << "ep:" << ep_ << ", var_name:" << var_name_ - << ", trainer_id:" << trainer_id_; - - return ss.str(); - } -}; - -class CollectiveClient { - public: - CollectiveClient() { - rpc_client_.reset(new RPCCLIENT_T()); - rpc_client_->InitImpl(); - } - virtual ~CollectiveClient() {} - - // note this function will retain the rank order. - bool Gather(const std::vector& remote_vars, - std::vector* dst, - const platform::DeviceContext& ctx, framework::Scope* scope, - int64_t time_out = FLAGS_rpc_deadline); - - static CollectiveClient* GetInstance() { - std::call_once(init_flag_, [&]() { - if (client_.get() == nullptr) { - client_.reset(new CollectiveClient()); - } - }); - return client_.get(); - } - - private: - std::unique_ptr rpc_client_; - - static std::once_flag init_flag_; - static std::unique_ptr client_; -}; -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server.cc b/paddle/fluid/operators/distributed/collective_server.cc deleted file mode 100644 index cdd37742d2d5a5..00000000000000 --- a/paddle/fluid/operators/distributed/collective_server.cc +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/distributed/collective_server.h" -#include - -DEFINE_int32(collective_get_thread_num, 5, "number of threads for rpc get"); - -namespace paddle { -namespace operators { -namespace distributed { - -std::once_flag CollectiveServer::init_flag_; -std::shared_ptr CollectiveServer::collective_server_(nullptr); - -CollectiveServer::CollectiveServer(const std::string& end_point, int fan_in) { - VLOG(1) << "Create colllective server:" << end_point << ", fan_in:" << fan_in; - rpc_server_.reset(new RPCSERVER_T(end_point, fan_in)); -} - -void CollectiveServer::Stop() { - rpc_server_->ShutDown(); - server_thread_->join(); - loop_thread_->join(); -} - -void CollectiveServer::StartServer() { - get_monomer_handler_.reset(new GetMonomerHandler()); - get_monomer_handler_->SetRPCServer(rpc_server_.get()); - - get_barrier_handler_.reset(new GetMonomerBarrierHandler()); - get_barrier_handler_->SetRPCServer(rpc_server_.get()); - - rpc_server_->RegisterRPC(distributed::kRequestGetMonomerVariable, - get_monomer_handler_.get(), - FLAGS_collective_get_thread_num); - rpc_server_->RegisterRPC(distributed::kRequestGetMonomerBarrier, - get_barrier_handler_.get(), 1); - - server_thread_.reset(new std::thread([&]() { rpc_server_->StartServer(); })); - rpc_server_->WaitServerReady(); - - loop_thread_.reset(new std::thread([&]() { - while (true) { - if (rpc_server_->IsExit()) { - LOG(WARNING) << "get exit!rpc_processor break!"; - break; - } - sleep(1); - } - VLOG(1) << "CollectiveServer loop_thread end"; - })); -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server.h b/paddle/fluid/operators/distributed/collective_server.h deleted file mode 100644 index 4964923286094a..00000000000000 --- a/paddle/fluid/operators/distributed/collective_server.h +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include // NOLINT -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class CollectiveServer; - -class GetMonomerHandler final : public RequestHandler { - public: - GetMonomerHandler() : RequestHandler(true) {} - virtual ~GetMonomerHandler() {} - bool Handle(const std::string& var_name, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override { - VLOG(50) << "GetMonomerHandler recv " << var_name; - - *outvar = scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL( - outvar, platform::errors::NotFound("var: %s is not found.", var_name)); - - return true; - } -}; - -class GetMonomerBarrierHandler final : public RequestHandler { - public: - GetMonomerBarrierHandler() : RequestHandler(true) {} - virtual ~GetMonomerBarrierHandler() {} - bool Handle(const std::string& var_name, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override { - VLOG(50) << "GetMonomerHandler recv " << var_name; - - rpc_server_->IncreaseVarBarrier(var_name); - - return true; - } -}; - -class CollectiveServer final { - public: - explicit CollectiveServer(const std::string& end_point, int fan_in); - - virtual ~CollectiveServer() {} - - void StartServer(); - - static CollectiveServer* GetInstance(const std::string& end_point, - int fan_in) { - std::call_once(init_flag_, [&]() { - if (collective_server_.get() == nullptr) { - collective_server_.reset(new CollectiveServer(end_point, fan_in)); - collective_server_->StartServer(); - } - }); - - return collective_server_.get(); - } - - std::shared_ptr GetRPCServer() { return rpc_server_; } - - void Stop(); - - private: - std::unique_ptr get_monomer_handler_; - std::unique_ptr get_barrier_handler_; - - std::shared_ptr rpc_server_; - std::shared_ptr server_thread_; - std::shared_ptr loop_thread_; - - bool ready_{false}; - - static std::once_flag init_flag_; - static std::shared_ptr collective_server_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc deleted file mode 100644 index 92b2eb4b51e59f..00000000000000 --- a/paddle/fluid/operators/distributed/collective_server_test.cc +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/operators/distributed/collective_client.h" -#include "paddle/fluid/operators/distributed/collective_server.h" - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -} // namespace paddle - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace distributed = paddle::operators::distributed; - -std::unique_ptr StartServer( - const std::string& ep, int fan_in, framework::Scope* scope, - platform::DeviceContext* dev_ctx) { - distributed::CollectiveServer* server = - distributed::CollectiveServer::GetInstance(ep, fan_in); - - auto rpc_server = server->GetRPCServer(); - rpc_server->RegisterVar("var1", distributed::kRequestGetMonomerVariable, - scope, dev_ctx); - - std::cout << "StartServer return" << std::endl; - return std::unique_ptr(server); -} - -std::unique_ptr GenerateVars(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - framework::Scope* scope = new framework::Scope(); - framework::Variable* var = scope->Var("var1"); - auto* slr = var->GetMutable(); - slr->set_height(20000); - - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - - tensor->Resize(framework::make_ddim({3, 1024})); - tensor->mutable_data(place); - - paddle::operators::math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 3; ++i) rows->push_back(i); - - std::cout << "src:" << distributed::GetSelectedRowsInfo(*slr); - - return std::unique_ptr(scope); -} - -void Gather(const std::vector& vars, - platform::DeviceContext* dev_ctx) { - distributed::CollectiveClient* client = - distributed::CollectiveClient::GetInstance(); - - framework::Scope* scope = new framework::Scope(); - framework::Variable* var = scope->Var("var1"); - var->GetMutable(); - - std::vector dst; - client->Gather(vars, &dst, *dev_ctx, scope); - std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]); - dev_ctx->Wait(); - - ASSERT_EQ(dst[0]->value().dims(), framework::make_ddim({3, 1024})); - ASSERT_EQ(dst[0]->height(), 20000); - ASSERT_EQ(dst[0]->rows().size(), static_cast(3)); - for (int i = 0; i < 3; i++) { - ASSERT_EQ(dst[0]->rows()[i], i); - } - - std::vector vec; - TensorToVector(dst[0]->value(), *dev_ctx, &vec); - for (size_t i = 0; i < 3 * 1024; i++) { - ASSERT_FLOAT_EQ(vec[i], 32.7); - } -} - -TEST(CollectiveServer, GPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - - platform::CUDAPlace place; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - std::string ep = "127.0.0.1:7164"; - auto scope = GenerateVars(place); - - auto* v1 = scope->FindVar("var1"); - std::cout << "var1:" << v1 << std::endl; - - auto server = StartServer(ep, 2, scope.get(), &ctx); - auto rpc_server = server->GetRPCServer(); - - distributed::RemoteVar var; - var.ep_ = ep; - var.var_name_ = "var1"; - var.trainer_id_ = 0; - - std::vector vars{var}; - Gather(vars, &ctx); - Gather(vars, &ctx); - - std::cout << "begin WaitVarBarrier" << std::endl; - rpc_server->WaitVarBarrier("var1"); - rpc_server->ClearRegisteredVars(); - server->Stop(); - - scope.release(); - server.release(); -} diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc deleted file mode 100644 
index 4ee27a6414698f..00000000000000 --- a/paddle/fluid/operators/distributed/communicator.cc +++ /dev/null @@ -1,989 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/distributed/communicator.h" - -#include - -#include -#include // NOLINT -#include -#include // NOLINT -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/parameter_recv.h" -#include "paddle/fluid/operators/distributed/parameter_send.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/split.h" - -namespace paddle { -namespace operators { -namespace distributed { - -using Tree = - std::map>>; -using RpcCtxMap = operators::distributed::RpcCtxMap; - -inline double GetCurrentUS() { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; -} - -Communicator::Communicator() {} - -std::once_flag Communicator::init_flag_; -std::shared_ptr Communicator::communicator_(nullptr); - -void AsyncCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) { - send_varname_to_ctx_ = std::move(send_varname_to_ctx); - recv_varname_to_ctx_ = std::move(recv_varname_to_ctx); - recv_scope_ = std::move(recv_scope); - - if (send_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be send, will not start send_thread"; - } else { - send_scope_.reset(new Scope()); - for (auto &iter : send_varname_to_ctx_) { - if (iter.first == STEP_COUNTER && !need_global_step_) continue; - send_varname_to_queue_[iter.first] = - std::make_shared>>( - send_queue_size_); - } - send_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - if (recv_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be received, will not start recv_thread"; - } else { - recv_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - InitParams(); -} - -void AsyncCommunicator::InitParams() { RecvNoBarrier(); } - -AsyncCommunicator::~AsyncCommunicator() { - running_ = false; - if (main_thread_) main_thread_->join(); -} - -void AsyncCommunicator::SendGlobalStep(int batches) { - if (!need_global_step_) { - return; - } - - if (batches == 0) { - return; - } - - auto &var_name = STEP_COUNTER; - auto *out_var = send_scope_->Var(var_name); - auto *out_t = out_var->GetMutable(); - auto *data = out_t->mutable_data({1}, platform::CPUPlace()); - data[0] = static_cast(batches); - - auto &ctx = send_varname_to_ctx_.at(var_name); - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); -} - -void AsyncCommunicator::SendByCommunicator() { - std::vector> task_futures; - 
task_futures.reserve(send_varname_to_ctx_.size()); - VLOG(3) << "run send graph"; - - auto before_run_send_graph = GetCurrentUS(); - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - auto send_task = [this, &var_name, &var_queue] { - VLOG(3) << var_name << " merge and send; "; - std::vector> vars; - - int merged_var_num = 0; - int wait_times = 0; - while (merged_var_num < max_merge_var_num_) { - if (var_queue->Size() == 0) { - VLOG(4) << "wait_times -> " << wait_times; - if (wait_times >= send_wait_times_) { - break; - } - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - wait_times++; - continue; - } else { - wait_times = 0; - - vars.push_back(var_queue->Pop()); - merged_var_num++; - } - } - auto before_merge = GetCurrentUS(); - if (var_name == STEP_COUNTER) { - SendGlobalStep(merged_var_num); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge and send " << merged_var_num << " " << var_name - << " use time " << after_merge - before_merge; - return; - } - - auto &ctx = send_varname_to_ctx_.at(var_name); - - MergeVars(var_name, vars, send_scope_.get(), ctx.merge_add); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge " << merged_var_num << " " << var_name << " use time " - << after_merge - before_merge; - - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); - auto after_send = GetCurrentUS(); - VLOG(3) << "send " << var_name << " use time " - << after_send - after_merge; - - if (var_name.rfind("@GRAD") != var_name.size() - 5) return; - - auto recv_param = var_name.substr(0, var_name.size() - 5); - if (recv_varname_to_ctx_.find(recv_param) == recv_varname_to_ctx_.end()) - return; - - auto recv_functor = distributed::ParameterRecv(); - recv_functor(recv_varname_to_ctx_.at(recv_param), *recv_scope_); - auto after_recv = GetCurrentUS(); - VLOG(3) << "recv " << recv_param << " use time " - << after_recv - after_send; - }; - task_futures.emplace_back(send_threadpool_->enqueue(std::move(send_task))); - } - for (auto &task_f : task_futures) { - task_f.wait(); - } - auto after_run_send_graph = GetCurrentUS(); - - VLOG(3) << "run send graph use time " - << (after_run_send_graph - before_run_send_graph); -} - -void HalfAsyncCommunicator::SendByCommunicator() { - std::vector> task_futures; - task_futures.reserve(send_varname_to_ctx_.size()); - VLOG(3) << "run send graph"; - - int batches = BatchesCounter(); - if (batches <= 0) return; - - auto before_run_send_graph = GetCurrentUS(); - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - auto send_task = [this, batches, &var_name, &var_queue] { - VLOG(3) << var_name << " merge and send; "; - auto before_task = GetCurrentUS(); - std::vector> vars; - vars.reserve(batches); - - for (int i = 0; i < batches; ++i) { - vars.push_back(var_queue->Pop()); - } - - if (var_name == STEP_COUNTER) { - SendGlobalStep(batches); - auto end_task = GetCurrentUS(); - VLOG(3) << "merge " << batches << " " << var_name << " use time " - << end_task - before_task; - return; - } - - auto &ctx = send_varname_to_ctx_.at(var_name); - - auto before_merge = GetCurrentUS(); - MergeVars(var_name, vars, send_scope_.get(), ctx.merge_add); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge " << batches << " " << var_name << " use time " - << after_merge - before_merge; - - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); - auto after_send = 
GetCurrentUS(); - VLOG(3) << "send " << var_name << " use time " - << after_send - before_task; - - if (var_name.rfind("@GRAD") != var_name.size() - 5) return; - - auto recv_param = var_name.substr(0, var_name.size() - 5); - if (recv_varname_to_ctx_.find(recv_param) == recv_varname_to_ctx_.end()) - return; - - auto recv_functor = distributed::ParameterRecv(); - recv_functor(recv_varname_to_ctx_.at(recv_param), *recv_scope_); - auto after_recv = GetCurrentUS(); - VLOG(3) << "recv " << recv_param << " use time " - << after_recv - after_send; - return; - }; - task_futures.emplace_back(send_threadpool_->enqueue(std::move(send_task))); - } - for (auto &task_f : task_futures) { - task_f.wait(); - } - auto after_run_send_graph = GetCurrentUS(); - - VLOG(3) << "run send graph use time " - << (after_run_send_graph - before_run_send_graph); -} - -void AsyncCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - SendByCommunicator(); - BarrierSend(); - } - VLOG(3) << "communicator stopped, send thread exit"; -} - -void HalfAsyncCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - SendByCommunicator(); - BarrierSend(); - RecvByCommunicator(); - BarrierRecv(); - BarrierWeakUp(); - } - VLOG(3) << "communicator stopped, send thread exit"; -} - -void AsyncCommunicator::RecvByCommunicator() { - VLOG(3) << "parallel run recv graph"; - if (!running_) return; - RecvNoBarrier(); - VLOG(3) << "run recv graph use time"; -} - -void AsyncCommunicator::RecvNoBarrier() { - std::vector> task_futures; - task_futures.reserve(recv_varname_to_ctx_.size()); - - for (auto &iter : recv_varname_to_ctx_) { - auto recv_task = [this, &iter] { - auto before_task = GetCurrentUS(); - auto &var_name = iter.first; - auto recv_functor = distributed::ParameterRecv(); - recv_functor(iter.second, *recv_scope_); - auto end_task = GetCurrentUS(); - VLOG(1) << "recv var " << var_name << " use time " - << (end_task - before_task); - }; - task_futures.emplace_back(recv_threadpool_->enqueue(std::move(recv_task))); - } - - for (auto &task : task_futures) { - task.wait(); - } -} - -void AsyncCommunicator::Start() { - VLOG(3) << "Communicator start"; - if (!communicator_) { - VLOG(0) << "Communicator is not inited, do nothing"; - } else { - VLOG(3) << "start send thread and recv thread"; - waiting_ = true; - running_ = true; - BarrierTriggerReset(max_merge_var_num_); - // start send and recv thread - main_thread_.reset( - new std::thread(std::bind(&AsyncCommunicator::MainThread, this))); - } -} - -void AsyncCommunicator::Stop() { - VLOG(3) << "Communicator stop"; - running_ = false; - if (!communicator_) { - VLOG(0) << "Communicator is not inited, do nothing"; - } else { - if (main_thread_) { - VLOG(3) << "stop send thread"; - main_thread_->join(); - main_thread_.reset(nullptr); - } - } - VLOG(3) << "Communicator stop done"; -} - -void AsyncCommunicator::Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) { - waiting_ = false; - - PADDLE_ENFORCE_EQ( - var_tables.size(), 1, - platform::errors::InvalidArgument("var_tables.size() == 1 is permitted")); - - auto table_name = var_tables[0]; - - if (table_name == STEP_COUNTER && !need_global_step_) return; - 
- auto before_send_op = GetCurrentUS(); - auto &queue = send_varname_to_queue_.at(table_name); - - if (table_name == STEP_COUNTER) { - auto tmp_var = std::make_shared(); - auto *tensor = tmp_var->GetMutable(); - tensor->Resize(framework::make_ddim({1})); - auto *out_d = tensor->mutable_data(platform::CPUPlace()); - out_d[0] = 1; - queue->Push(tmp_var); - } else { - PADDLE_ENFORCE_GE(var_names.size(), 1, - platform::errors::InvalidArgument( - "var_names.size() >= 1 is permitted")); - - auto *var = scope.FindVar(var_names[0]); - - PADDLE_ENFORCE_EQ( - var->IsInitialized(), true, - platform::errors::InvalidArgument("grad var should be inited")); - - auto tmp_var = std::make_shared(); - if (var->IsType()) { - framework::CopyVariable(*var, tmp_var.get()); - queue->Push(tmp_var); - } else if (var->IsType()) { - // push var into send queue by var_name - auto var_name = var_names[0]; - framework::CopyVariable(*var, tmp_var.get()); - queue->Push(tmp_var); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "unknown var type to copy, only support LoDTensor/SelectedRows")); - } - } - auto after_send_op = GetCurrentUS(); - VLOG(3) << "send to " << table_name << " with queue size " << queue->Size() - << ", use time " << (after_send_op - before_send_op); -} - -void HalfAsyncCommunicator::Clean() { - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - while (var_queue->Size() > 0) { - var_queue->Pop(); - } - - VLOG(3) << "clean var: " << var_name << " done"; - } -} - -int HalfAsyncCommunicator::BatchesCounter() { - while (running_) { - if (barrier_counter_.load() >= barrier_trigger_.load() && - barrier_trigger_.load() != 0) { - break; - } else { - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - } - - return barrier_counter_.load(); -} - -void HalfAsyncCommunicator::Barrier() { - barrier_counter_++; - - if (!running_) { - VLOG(3) << "Communicator is not running, release barrier"; - return; - } - - { - std::unique_lock lk(barrier_mutex_); - barrier_cond_.wait(lk, [this] { return (barrier_counter_ == 0); }); - } -} - -void HalfAsyncCommunicator::BarrierTriggerDecrement() { - barrier_trigger_--; - VLOG(3) << "BarrierTriggerDecrement decrement barrier trigger to " - << barrier_trigger_.load(); -} - -void HalfAsyncCommunicator::BarrierTriggerReset(int initial_val) { - barrier_trigger_.store(initial_val); - - VLOG(3) << "BarrierTriggerReset reset barrier trigger to " - << barrier_trigger_.load(); -} - -void HalfAsyncCommunicator::BarrierWeakUp() { - barrier_counter_.store(0); - barrier_cond_.notify_all(); -} - -void SyncCommunicator::BarrierSend() { - if (!running_) return; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id_); - - std::vector rets; - - for (auto &ep : pserver_endpoints_) { - rets.push_back(rpc_client->AsyncSendBatchBarrier(ep)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::External( - "internal error in RPCClient")); - } - - VLOG(4) << "BarrierSend with SyncCommunicator"; -} - -void SyncCommunicator::BarrierRecv() { - if (!running_) return; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id_); - - std::vector rets; - for (auto &ep : pserver_endpoints_) { - rets.push_back(rpc_client->AsyncSendFetchBarrier(ep)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::External( - "internal error in RPCClient")); - } 
- - VLOG(4) << "BarrierRecv with SyncCommunicator"; -} - -void GeoCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) { - send_varname_to_ctx_ = std::move(send_varname_to_ctx); - recv_varname_to_ctx_ = std::move(recv_varname_to_ctx); - recv_scope_ = std::move(recv_scope); - - PADDLE_ENFORCE_GT( - send_varname_to_ctx.size(), 0, - platform::errors::InvalidArgument("send var contexts can not be zero")); - - send_scope_.reset(new Scope()); - for (auto &iter : send_varname_to_ctx_) { - auto &varname = iter.first; - - if (varname == STEP_COUNTER) { - send_varname_to_queue_[varname] = - std::make_shared>>( - send_queue_size_); - } else { - auto &send_ctx = iter.second; - - send_var_nums_ += send_ctx.splited_varnames.size(); - if (!send_ctx.is_sparse) { - continue; - } - int pserver_num = static_cast(send_ctx.epmap.size()); - for (int ep_idx = 0; ep_idx < pserver_num; ep_idx++) { - sparse_id_queues_.insert( - std::pair>>>>( - send_ctx.splited_varnames[ep_idx], - std::make_shared< - BlockingQueue>>>( - send_queue_size_))); - } - } - } - send_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - - if (recv_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be received, will not start recv_thread"; - } else { - recv_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - delta_scope_.reset(new Scope()); - old_scope_.reset(new Scope()); - pserver_scope_.reset(new Scope()); - - InitParams(); -} - -void GeoCommunicator::Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) { - waiting_ = false; - PADDLE_ENFORCE_EQ( - var_tables.size(), 1, - platform::errors::InvalidArgument("var_tables.size() == 1 is permitted")); - - auto table_name = var_tables[0]; - if (table_name == STEP_COUNTER) return; - - auto before_send = GetCurrentUS(); - size_t splited_var_nums = - send_varname_to_ctx_[table_name].splited_varnames.size(); - - std::unordered_map> ids_table; - - for (size_t j = 0; j < splited_var_nums; j++) { - ids_table.insert(std::pair>( - send_varname_to_ctx_[table_name].splited_varnames[j], - std::unordered_set())); - } - auto *var = scope.FindVar(var_names[0]); - auto &rows = var->Get().rows(); - - // insert ids which has not been record - for (size_t j = 0; j < rows.size(); j++) { - auto ep_idx = rows[j] % splited_var_nums; - ids_table.at(send_varname_to_ctx_[table_name].splited_varnames[ep_idx]) - .insert(rows[j]); - } - - auto before_push = GetCurrentUS(); - for (auto &iter : ids_table) { - auto &key = iter.first; - auto &sparse_ids_set = iter.second; - auto sparse_ids_vec = std::make_shared>(); - sparse_ids_vec->assign(sparse_ids_set.begin(), sparse_ids_set.end()); - sparse_id_queues_.at(key)->Push(sparse_ids_vec); - VLOG(3) << "push " << sparse_ids_vec->size() << " ids to " << key - << "'s queue"; - } - auto after_send = GetCurrentUS(); - VLOG(3) << "run send " << table_name << " op finish. 
using " - << (before_push - before_send) << "; " << (after_send - before_push); -} - -void GeoCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - std::vector> tasks; - tasks.reserve(send_var_nums_); - - for (auto &iter : send_varname_to_ctx_) { - auto &var_name = iter.first; - auto &send_ctx = iter.second; - int pserver_num = static_cast(send_ctx.epmap.size()); - if (send_ctx.is_sparse) { - for (int ep_idx = 0; ep_idx < pserver_num; ep_idx++) { - auto send_recv_task = [this, ep_idx, &var_name] { - auto before_send_sparse = GetCurrentUS(); - if (var_name == STEP_COUNTER) { - return; - } - auto send_varname = - send_varname_to_ctx_.at(var_name).splited_varnames[ep_idx]; - auto sparse_ids = MergeSparseIds(send_varname); - if (sparse_ids.size() == 0) { - return; - } - SendSparse(var_name, ep_idx, sparse_ids); - auto after_send_sparse = GetCurrentUS(); - RecvSparse(var_name, ep_idx); - auto after_recv_sparse = GetCurrentUS(); - VLOG(3) - << "send recv " - << send_varname_to_ctx_.at(var_name).splited_varnames[ep_idx] - << " finish, using " << (after_send_sparse - before_send_sparse) - << " and " << (after_recv_sparse - after_send_sparse) - << "; total = " << (after_recv_sparse - before_send_sparse); - }; - tasks.emplace_back( - send_threadpool_->enqueue(std::move(send_recv_task))); - } - } else { - auto send_recv_task = [this, &var_name, &send_ctx] { - if (var_name == STEP_COUNTER) { - return; - } - SendDense(var_name); - RecvDense(var_name); - }; - tasks.emplace_back( - send_threadpool_->enqueue(std::move(send_recv_task))); - } - } - for (auto &task : tasks) { - task.wait(); - } - } -} - -std::vector GeoCommunicator::MergeSparseIds( - const std::string &send_varname) { - size_t merge_num = 0, wait_times = 0; - std::unordered_set sparse_ids; - while (merge_num < static_cast(max_merge_var_num_)) { - VLOG(3) << "Merge Number of " << send_varname << " = " << merge_num; - if (sparse_id_queues_.at(send_varname)->Size() > 0) { - wait_times = 0; - std::shared_ptr> pop_ids = - sparse_id_queues_.at(send_varname)->Pop(); - for (size_t j = 0; j < pop_ids->size(); j++) { - sparse_ids.insert(pop_ids->at(j)); - } - merge_num += 1; - VLOG(3) << "sparse_id_queues_(" << send_varname << ") pushed"; - } else if (sparse_id_queues_.at(send_varname)->Size() == 0) { - VLOG(3) << "wait_times -> " << wait_times; - if (wait_times >= static_cast(send_wait_times_)) { - break; - } - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - wait_times++; - continue; - } - } - std::vector res; - res.assign(sparse_ids.begin(), sparse_ids.end()); - return res; -} -void GeoCommunicator::SendSparse(const std::string &varname, int ep_idx, - const std::vector &sparse_ids) { - auto &rpc_ctx = send_varname_to_ctx_.at(varname); - auto send_varname = rpc_ctx.splited_varnames[ep_idx]; - auto trainer_id = rpc_ctx.trainer_id; - auto endpoint = rpc_ctx.epmap[ep_idx]; - auto pserver_num = rpc_ctx.epmap.size(); - - auto *var_latest = recv_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ(var_latest->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - auto &t_latest = var_latest->Get(); - - auto dims1 = t_latest.dims()[1]; - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(send_varname); - auto *t_delta = var_delta->GetMutable(); - - auto *t_value = 
t_delta->mutable_value(); - t_value->mutable_data( - framework::make_ddim({static_cast(sparse_ids.size()), dims1}), - cpu_ctx.GetPlace()); - - std::vector *>> values; - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Get(sparse_ids, {"Param"}, &values); - - auto blas = math::GetBlas(cpu_ctx); - float coefficient = 1.0 / static_cast(trainers_); - - for (auto j = 0; j < static_cast(sparse_ids.size()); ++j) { - blas.VSUB(dims1, t_latest.data() + sparse_ids[j] * dims1, - values[j][0]->data(), t_value->data() + j * dims1); - blas.SCAL(dims1, coefficient, t_value->data() + j * dims1); - blas.VADD(dims1, values[j][0]->data(), t_value->data() + j * dims1, - values[j][0]->data()); - } - - std::vector send_rows; - send_rows.reserve(sparse_ids.size()); - for (auto idx : sparse_ids) { - send_rows.push_back(idx / pserver_num); - } - t_delta->set_height(rpc_ctx.height_sections[ep_idx]); - t_delta->set_rows(send_rows); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx_send = *pool.Get(platform::CPUPlace()); - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id); - - auto ret = rpc_client->AsyncSendVar(endpoint, cpu_ctx_send, - *delta_scope_.get(), send_varname); - ret->Wait(); -} - -void GeoCommunicator::SendDense(const std::string &varname) { - auto *var_latest = recv_scope_->FindVar(varname); - auto *var_timestamp = old_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ(var_latest->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - PADDLE_ENFORCE_EQ(var_timestamp->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - - auto &t_latest = var_latest->Get(); - auto t_timestamp = var_timestamp->GetMutable(); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(varname); - auto *t_delta = var_delta->GetMutable(); - t_delta->mutable_data(t_latest.dims(), cpu_ctx.GetPlace()); - - auto blas = math::GetBlas(cpu_ctx); - blas.VSUB(t_latest.numel(), t_latest.data(), - t_timestamp->data(), t_delta->data()); - - float coefficient = 1.0 / static_cast(trainers_); - blas.SCAL(t_latest.numel(), coefficient, t_delta->data()); - - blas.VADD(t_latest.numel(), t_timestamp->data(), - t_delta->data(), t_timestamp->data()); - - auto &ctx = send_varname_to_ctx_.at(varname); - auto send = distributed::ParameterSend(); - send(ctx, *delta_scope_, true, 1); -} - -void GeoCommunicator::RecvByCommunicator() { return; } - -void GeoCommunicator::RecvSparse(const std::string &varname, int ep_idx) { - auto train_id = recv_varname_to_ctx_.at(varname).trainer_id; - auto endpoint = recv_varname_to_ctx_.at(varname).epmap[ep_idx]; - auto splited_var_name = - recv_varname_to_ctx_.at(varname).splited_varnames[ep_idx]; - auto pserver_num = recv_varname_to_ctx_.at(varname).epmap.size(); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx_recv = *pool.Get(platform::CPUPlace()); - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(train_id); - - auto *var_psrever = pserver_scope_->Var(splited_var_name); - auto handle = rpc_client->AsyncGetVar(endpoint, cpu_ctx_recv, - *pserver_scope_.get(), splited_var_name, - splited_var_name, splited_var_name); - handle->Wait(); - - auto *var_latest = recv_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ( - var_psrever->IsInitialized(), true, - platform::errors::Unavailable( - "%s in 
pserver scope is not initialized, please check", varname)); - - std::vector ids; - ids.assign(var_psrever->Get().rows().begin(), - var_psrever->Get().rows().end()); - - for (size_t j = 0; j < ids.size(); j++) { - ids[j] = ids[j] * pserver_num + ep_idx; - } - - VLOG(3) << "RecvSparse receive var: " << splited_var_name - << " ids Size: " << ids.size(); - - auto t_psrever = var_psrever->Get().value(); - - std::vector *>> old_values; - - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Get(ids, {"Param"}, &old_values); - - auto *t_latest = var_latest->GetMutable(); - - auto dims1 = t_latest->dims()[1]; - auto numel = ids.size() * dims1; - - std::vector v_delta; - v_delta.resize(numel); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto blas = math::GetBlas(cpu_ctx); - - for (auto j = 0; j < static_cast(ids.size()); ++j) { - blas.VSUB(dims1, t_psrever.data() + j * dims1, - old_values[j][0]->data(), v_delta.data() + j * dims1); - blas.VADD(dims1, t_latest->data() + ids[j] * dims1, - v_delta.data() + j * dims1, - t_latest->data() + ids[j] * dims1); - blas.VCOPY(dims1, t_psrever.data() + j * dims1, - old_values[j][0]->data()); - } -} - -void GeoCommunicator::RecvDense(const std::string &varname) { - auto *var_latest = recv_scope_->FindVar(varname); - auto *var_timestamp = old_scope_->FindVar(varname); - auto *var_psrever = pserver_scope_->Var(varname); - - auto &ctx = recv_varname_to_ctx_.at(varname); - auto recv = distributed::ParameterRecv(); - recv(ctx, *pserver_scope_); - - PADDLE_ENFORCE_EQ( - var_psrever->IsInitialized(), true, - platform::errors::Unavailable( - "%s in pserver scope is not initialized, please check", varname)); - - auto t_psrever = var_psrever->Get(); - auto t_latest = var_latest->GetMutable(); - auto t_timestamp = var_timestamp->GetMutable(); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(varname); - auto *t_delta = var_delta->GetMutable(); - t_delta->mutable_data(t_latest->dims(), cpu_ctx.GetPlace()); - - auto blas = math::GetBlas(cpu_ctx); - blas.VSUB(t_latest->numel(), t_psrever.data(), - t_timestamp->data(), t_delta->data()); - blas.VADD(t_latest->numel(), t_latest->data(), t_delta->data(), - t_latest->data()); - blas.VCOPY(t_latest->numel(), t_psrever.data(), - t_timestamp->data()); -} - -void GeoCommunicator::InitParams() { - std::vector> tasks; - tasks.reserve(recv_varname_to_ctx_.size()); - - for (auto &iter : recv_varname_to_ctx_) { - auto &var_name = iter.first; - auto &recv_ctx = iter.second; - - auto recv_task = [this, &var_name, &recv_ctx] { - if (!recv_ctx.is_sparse) { - InitDense(var_name); - } - }; - tasks.emplace_back(send_threadpool_->enqueue(std::move(recv_task))); - } - - for (auto &task : tasks) { - task.wait(); - } - InitSparse(); -} - -void GeoCommunicator::InitDense(const std::string varname) { - auto &ctx = recv_varname_to_ctx_.at(varname); - auto recv = distributed::ParameterRecv(); - recv(ctx, *recv_scope_); - - auto *global_var = recv_scope_->FindVar(varname); - global_var->GetMutable(); - - auto *old_var = old_scope_->Var(varname); - old_var->GetMutable(); - - framework::CopyVariable(*global_var, old_var); - VLOG(1) << "init dense variable " << varname << " done"; -} - -void GeoCommunicator::InitSparse() { - auto sparse_metas = string::split_string(sparse_attrs_, "#"); - - std::vector metas; - std::vector dicts; - - for (auto &sparse_meta : sparse_metas) { - auto attrs = string::split_string(sparse_meta, ":"); - - auto meta = distributed::SparseMeta(); - 
meta.name = attrs[0]; - meta.value_names = {"Param"}; - - auto dic = string::split_string(attrs[1], ","); - dicts.push_back(std::stoi(dic[0])); - meta.value_dims = {std::stoi(dic[1])}; - meta.mode = distributed::Mode::training; - meta.grad_name = "none"; - meta.cached_varnames = {}; - meta.initializer_attrs = string::split_string(attrs[2]); - meta.entry = "none"; - - VLOG(3) << "add sparse meta: " << meta.ToString(); - metas.push_back(meta); - } - - LargeScaleKV::Init(metas); - - for (auto &meta : metas) { - auto &ctx = recv_varname_to_ctx_.at(meta.name); - auto recv = distributed::ParameterRecv(); - - auto *global_var = recv_scope_->FindVar(meta.name); - auto global_value = global_var->Get(); - auto rows = global_value.dims()[0]; - auto dim1 = global_value.dims()[1]; - - recv(ctx, *recv_scope_); - VLOG(1) << "recv " << meta.name << " with global scope for init"; - - auto n_rows = global_var->Get().dims()[0]; - - PADDLE_ENFORCE_EQ( - rows, n_rows, - platform::errors::InvalidArgument( - "global var: %s origin dim must equal recved rows", meta.name)); - - std::vector ids(rows); - std::iota(ids.begin(), ids.end(), 0); - - auto *ins = distributed::LargeScaleKV::GetInstance(); - std::vector *>> values; - - ins->Get(meta.name)->Init(ids); - ins->Get(meta.name)->Get(ids, {"Param"}, &values); - - auto blas = math::GetBlas( - paddle::platform::CPUDeviceContext()); - - for (auto &id : ids) { - blas.VCOPY(dim1, global_value.data() + id * dim1, - values[id][0]->data()); - } - } - - VLOG(3) << "init sparse variable done"; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h deleted file mode 100644 index 4be3253d3923f8..00000000000000 --- a/paddle/fluid/operators/distributed/communicator.h +++ /dev/null @@ -1,490 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
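For reference while reading the deleted GeoCommunicator code above: SendDense and SendSparse send delta = (latest - old) / trainers to the parameter server and fold the same delta into the locally cached old copy, while RecvDense and RecvSparse add (pserver - old) to the local parameter and then overwrite old with the pserver value. Below is a minimal standalone sketch of that update rule on plain float buffers; the function names and the trainers argument are illustrative only, not Paddle API.

    #include <cstddef>
    #include <vector>

    // Send step: report the averaged local progress and advance the cached
    // "old" copy by exactly the amount reported (the VSUB/SCAL/VADD calls above).
    void GeoSendDense(const std::vector<float>& latest, std::vector<float>* old,
                      std::vector<float>* delta, int trainers) {
      delta->resize(latest.size());
      for (std::size_t i = 0; i < latest.size(); ++i) {
        (*delta)[i] = (latest[i] - (*old)[i]) / static_cast<float>(trainers);
        (*old)[i] += (*delta)[i];
      }
    }

    // Recv step: apply the server's progress since the last snapshot,
    // then remember the server copy as the new snapshot.
    void GeoRecvDense(std::vector<float>* latest, std::vector<float>* old,
                      const std::vector<float>& pserver) {
      for (std::size_t i = 0; i < latest->size(); ++i) {
        (*latest)[i] += pserver[i] - (*old)[i];
        (*old)[i] = pserver[i];
      }
    }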
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" - -DECLARE_bool(communicator_is_sgd_optimizer); - -namespace paddle { -namespace operators { -namespace distributed { - -using Scope = framework::Scope; -using Variable = framework::Variable; - -template -class BlockingQueue { - public: - explicit BlockingQueue(size_t capacity) : capacity_(capacity) { - PADDLE_ENFORCE_GT(capacity_, 0, - platform::errors::InvalidArgument( - "The capacity must be greater than 0.")); - } - - bool Push(const T &elem) { - { - std::unique_lock lock(mutex_); - cv_.wait(lock, [&] { return queue_.size() < capacity_; }); - PADDLE_ENFORCE_LT( - queue_.size(), capacity_, - platform::errors::OutOfRange("The queue size: %s out of capacity:%s", - queue_.size(), capacity_)); - queue_.push_back(elem); - } - cv_.notify_one(); - return true; - } - - bool Push(T &&elem) { - { - std::unique_lock lock(mutex_); - cv_.wait(lock, [&] { return queue_.size() < capacity_; }); - PADDLE_ENFORCE_LT( - queue_.size(), capacity_, - platform::errors::OutOfRange("The queue size: %s out of capacity:%s", - queue_.size(), capacity_)); - queue_.emplace_back(std::move(elem)); - } - cv_.notify_one(); - return true; - } - - T Pop() { - std::unique_lock lock(mutex_); - cv_.wait(lock, [=] { return !queue_.empty(); }); - T rc(std::move(queue_.front())); - queue_.pop_front(); - cv_.notify_one(); - return rc; - } - - size_t Cap() const { - std::lock_guard lock(mutex_); - return capacity_; - } - - size_t Size() const { - std::lock_guard lock(mutex_); - return queue_.size(); - } - - private: - const size_t capacity_; - std::deque queue_; - - mutable std::mutex mutex_; - std::condition_variable cv_; -}; - -template -using EigenVector = framework::EigenVector; - -template -inline void MergeVars(const std::string &var_name, - const std::vector> &vars, - Scope *scope, bool merge_add = true) { - PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument( - "vector vars are empty.")); - auto cpu_place = platform::CPUPlace(); - auto &var0 = vars[0]; - auto *out_var = scope->Var(var_name); - if (var0->IsType()) { - auto dims = var0->Get().dims(); - VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims - << "; merge add: " << merge_add; - // init output tensor - auto *out_t = out_var->GetMutable(); - out_t->mutable_data(dims, cpu_place); - // check the input dims - for (auto &var : vars) { - auto &var_t = var->Get(); - PADDLE_ENFORCE_EQ( - var_t.dims(), dims, - platform::errors::InvalidArgument("vars should have the same dims")); - } - - // set output tensor to 0. 
- auto cpu_ctx = paddle::platform::CPUDeviceContext(); - math::SetConstant constant_functor; - constant_functor(cpu_ctx, out_t, static_cast(0)); - // sum all vars to out - auto result = EigenVector::Flatten(*out_t); - for (auto &var : vars) { - auto &in_t = var->Get(); - auto in = EigenVector::Flatten(in_t); - result.device(*cpu_ctx.eigen_device()) = result + in; - } - if (!merge_add) { - result.device(*cpu_ctx.eigen_device()) = - result / static_cast(vars.size()); - } - } else if (var0->IsType()) { - auto &slr0 = var0->Get(); - auto *out_slr = out_var->GetMutable(); - out_slr->mutable_rows()->clear(); - out_slr->mutable_value()->mutable_data({{}}, cpu_place); - std::vector inputs; - inputs.reserve(vars.size()); - for (auto &var : vars) { - inputs.push_back(&var->Get()); - } - auto dev_ctx = paddle::platform::CPUDeviceContext(); - if (merge_add) { - math::scatter::MergeAdd merge_add; - merge_add(dev_ctx, inputs, out_slr); - } else { - math::scatter::MergeAverage - merge_average; - merge_average(dev_ctx, inputs, out_slr); - } - - VLOG(3) << "merge " << var_name << " SelectedRows height: " << slr0.height() - << " dims: " << slr0.value().dims() << "; merge add: " << merge_add; - } else { - PADDLE_THROW(platform::errors::InvalidArgument("unsupported var type: %s!", - var0->Type())); - } -} - -using RpcCtxMap = std::unordered_map; -using SparseValue = std::unordered_map>; - -class Communicator { - public: - Communicator(); - - explicit Communicator(const std::map &envs_) { - for (auto &iter : envs_) { - envs[iter.first] = iter.second; - } - } - - virtual ~Communicator() {} - - virtual void Start() = 0; - - virtual void Stop() = 0; - - virtual bool IsRunning() { return running_; } - - virtual void Clean() {} - - virtual void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) = 0; - - virtual void RecvNoBarrier() {} - - virtual void Barrier() {} - - virtual void BarrierTriggerDecrement() {} - - virtual void BarrierTriggerReset(int init_counter) {} - - virtual void InitEnvs() = 0; - - virtual void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) {} - - static Communicator *GetInstance() { return communicator_.get(); } - - static std::shared_ptr GetInstantcePtr() { - return communicator_; - } - - template - static Communicator *InitInstance( - const RpcCtxMap &send_ctx, const RpcCtxMap &recv_ctx, Scope *recv_scope, - const std::map &envs) { - std::call_once(init_flag_, &Communicator::InitWithRpcCtx, send_ctx, - recv_ctx, recv_scope, std::ref(envs)); - return communicator_.get(); - } - - // Init is called by InitInstance. 
- template - static void InitWithRpcCtx(const RpcCtxMap &send_ctx, - const RpcCtxMap &recv_ctx, Scope *recv_scope, - const std::map &envs) { - if (communicator_.get() == nullptr) { - communicator_.reset(new T(std::ref(envs))); - communicator_->InitEnvs(); - communicator_->InitImpl(send_ctx, recv_ctx, recv_scope); - } - } - - protected: - bool running_ = false; - bool waiting_ = true; - static std::shared_ptr communicator_; - static std::once_flag init_flag_; - std::unordered_map envs; -}; - -class AsyncCommunicator : public Communicator { - public: - AsyncCommunicator() : Communicator() {} - - explicit AsyncCommunicator(const std::map &envs) - : Communicator(envs) {} - - ~AsyncCommunicator(); - - void InitEnvs() { - min_send_grad_num_before_recv_ = - std::stoi(envs.at("communicator_min_send_grad_num_before_recv")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - VLOG(0) << "AsyncCommunicator Initialized"; - } - - void Start() override; - - void Stop() override; - - void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) override; - - void InitParams(); - - virtual void MainThread(); - - void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) override; - - virtual void SendByCommunicator(); - virtual void SendGlobalStep(int batches); - - virtual void RecvByCommunicator(); - - virtual void RecvNoBarrier(); - - virtual void BarrierSend() {} - - virtual void BarrierRecv() {} - - virtual void BarrierWeakUp() {} - - protected: - int min_send_grad_num_before_recv_; - int thread_pool_size_; - int max_merge_var_num_; - int send_wait_times_; - int send_queue_size_; - int trainer_id_ = 0; - bool need_global_step_ = false; - - std::unordered_map>>> - send_varname_to_queue_; - RpcCtxMap send_varname_to_ctx_; - RpcCtxMap recv_varname_to_ctx_; - std::unique_ptr main_thread_{nullptr}; - Scope *recv_scope_; // should be global scope - std::unique_ptr send_scope_; // an independent scope - std::unique_ptr<::ThreadPool> send_threadpool_{nullptr}; - std::unique_ptr<::ThreadPool> recv_threadpool_{nullptr}; - std::atomic_uint grad_num_{0}; // the num of gradient sent since last recv -}; - -class HalfAsyncCommunicator : public AsyncCommunicator { - public: - HalfAsyncCommunicator() {} - - explicit HalfAsyncCommunicator(const std::map &envs) - : AsyncCommunicator(envs) {} - - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - VLOG(0) << "HalfAsyncCommunicator Initialized"; - } - - void MainThread() override; - - void SendByCommunicator() override; - - void Clean() override; - - void Barrier() override; - - void BarrierTriggerDecrement() override; - - void BarrierTriggerReset(int initial_val) override; - - int BatchesCounter(); - - void BarrierWeakUp(); - - protected: - // mutex for Wait for barrier - 
std::mutex barrier_mutex_; - std::condition_variable barrier_cond_; - std::atomic barrier_trigger_{0}; - std::atomic barrier_counter_{0}; -}; - -class SyncCommunicator : public HalfAsyncCommunicator { - public: - SyncCommunicator() : HalfAsyncCommunicator() {} - - explicit SyncCommunicator(const std::map &envs) - : HalfAsyncCommunicator(envs) {} - - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - - trainer_id_ = std::stoi(envs.at("trainer_id")); - auto pserver_strings = envs.at("pserver_endpoints"); - pserver_endpoints_ = paddle::string::Split(pserver_strings, ','); - VLOG(0) << "SyncCommunicator Initialized"; - } - - void BarrierSend(); - - void BarrierRecv(); - - private: - std::vector pserver_endpoints_{}; -}; - -class GeoCommunicator : public AsyncCommunicator { - public: - GeoCommunicator() : AsyncCommunicator() {} - - explicit GeoCommunicator(const std::map &envs) - : AsyncCommunicator(envs) {} - - void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) override; - void MainThread() override; - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - - send_queue_size_ = max_merge_var_num_; - trainers_ = std::stoi(envs.at("trainers")); - sparse_attrs_ = envs.at("sparse_attrs"); - VLOG(0) << "GeoCommunicator Initialized"; - } - - void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) override; - - void SendByCommunicator() { return; } - - std::vector MergeSparseIds(const std::string &send_varname); - - void SendSparse(const std::string &varname, int ep_idx, - const std::vector &sparse_ids); - - void SendDense(const std::string &varname); - - void SendGlobalStep(int batches) override {} - - void RecvByCommunicator() override; - - void RecvSparse(const std::string &varname, int ep_idx); - - void RecvDense(const std::string &varname); - - void InitParams(); - - void InitSparse(); - - void InitDense(const std::string varname); - - private: - int trainers_; - std::string sparse_attrs_; - - // parameter for delta calc and send - std::shared_ptr delta_scope_; - - // parameter for storage the pserver param after last recv - std::shared_ptr old_scope_; - - // parameter on pserver - std::shared_ptr pserver_scope_; - - int send_var_nums_ = 0; - - std::unordered_map> old_sparses_; - - std::unordered_map< - std::string, - std::shared_ptr>>>> - sparse_id_queues_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator_common.h b/paddle/fluid/operators/distributed/communicator_common.h deleted file mode 100644 index 122d904eba27aa..00000000000000 --- a/paddle/fluid/operators/distributed/communicator_common.h +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -namespace paddle { -namespace operators { -namespace distributed { - -struct CommContext { - CommContext() = default; - - CommContext(const std::string &name, const std::vector &names, - const std::vector &emap, - const std::vector §ions, - const std::vector &origin_names, int id, - bool merge_add_ = true, bool is_sparse_ = true, - bool is_distributed_ = false) - : var_name(name), - splited_varnames(names), - epmap(emap), - height_sections(sections), - origin_varnames(origin_names), - trainer_id(id), - merge_add(merge_add_), - is_sparse(is_sparse_), - is_distributed(is_distributed_) {} - - CommContext(const CommContext &ctx) { - var_name = ctx.var_name; - splited_varnames = ctx.splited_varnames; - epmap = ctx.epmap; - height_sections = ctx.height_sections; - trainer_id = ctx.trainer_id; - merge_add = ctx.merge_add; - is_sparse = ctx.is_sparse; - origin_varnames = ctx.origin_varnames; - is_distributed = ctx.is_distributed; - } - - std::string print() const { - std::stringstream ss; - - ss << "varname: " << var_name << " trainer_id: " << trainer_id << " "; - - for (size_t i = 0; i < splited_varnames.size(); i++) { - ss << "slice varname: " << splited_varnames[i] << " ep: " << epmap[i] - << " section: " << height_sections[i] << " "; - } - - ss << "origin varnames: "; - for (size_t i = 0; i < origin_varnames.size(); i++) { - ss << origin_varnames[i] << " "; - } - - ss << " aggregation->add: " << merge_add << " "; - ss << " is_sparse: " << is_sparse << "\n"; - ss << " is_distributed: " << is_distributed << "\n"; - - return ss.str(); - } - - std::string var_name; - std::vector splited_varnames; - std::vector epmap; - std::vector height_sections; - std::vector origin_varnames; - int trainer_id; - bool merge_add; - bool is_sparse; - bool is_distributed; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator_test.cc b/paddle/fluid/operators/distributed/communicator_test.cc deleted file mode 100644 index 38b7c8b00317e6..00000000000000 --- a/paddle/fluid/operators/distributed/communicator_test.cc +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include - -#include "paddle/fluid/operators/distributed/communicator.h" - -namespace paddle { -namespace operators { -namespace distributed { - -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; - -TEST(communicator, merge_lod_tensors) { - auto cpu_place = platform::CPUPlace(); - auto dims = framework::make_ddim({2, 3}); - std::vector> in_vars; - float out_value = 0; - for (auto i = 0; i < 10; ++i) { - auto var = std::make_shared(); - in_vars.emplace_back(var); - auto *tensor = var->GetMutable(); - auto *data = tensor->mutable_data(dims, cpu_place); - for (auto j = 0; j < tensor->numel(); ++j) { - data[j] = static_cast(i); - } - out_value += static_cast(i); - } - const std::string out_name = "Out"; - std::unique_ptr scope; - scope.reset(new framework::Scope()); - scope->Var(out_name); - for (auto i = 0; i < 10; ++i) { - MergeVars(out_name, in_vars, scope.get()); - } - auto &out_tensor = scope->FindVar(out_name)->Get(); - auto *out_data = out_tensor.data(); - ASSERT_EQ(out_tensor.dims(), dims); - for (auto i = 0; i < out_tensor.numel(); ++i) { - ASSERT_EQ(out_data[i], out_value); - } -} - -TEST(communicator, merge_selected_rows) { - auto cpu_place = platform::CPUPlace(); - int64_t width = 10; - std::vector> in_vars; - const int64_t height = 100; - for (auto i = 0; i < 10; ++i) { - std::vector rows; - for (auto k = 0; k <= i; ++k) { - rows.push_back(k); - } - auto var = std::make_shared(); - in_vars.emplace_back(var); - auto *slr = var->GetMutable(); - slr->set_height(height); - slr->set_rows(rows); - auto dims = - framework::make_ddim({static_cast(rows.size()), width}); - auto *data = slr->mutable_value()->mutable_data(dims, cpu_place); - for (size_t i = 0; i < rows.size(); ++i) { - for (auto j = 0; j < width; ++j) { - data[i * width + j] = static_cast(rows[i]); - } - } - } - const std::string out_name = "Out"; - std::unique_ptr scope; - scope.reset(new framework::Scope()); - scope->Var(out_name); - for (auto i = 0; i < 10; ++i) { - MergeVars(out_name, in_vars, scope.get()); - } - auto &out_slr = scope->FindVar(out_name)->Get(); - auto &out_t = out_slr.value(); - auto *out_data = out_t.data(); - ASSERT_EQ(out_t.dims(), framework::make_ddim({10, width})); - std::vector out_values; - out_values.reserve(10); - for (auto i = 0; i < 10; ++i) { - out_values.push_back(static_cast(i * (10 - i))); - } - for (size_t i = 0; i < out_slr.rows().size(); ++i) { - ASSERT_EQ(out_slr.rows()[i], static_cast(i)); - for (auto j = 0; j < width; ++j) { - ASSERT_EQ(out_data[i * width + j], out_values[i]); - } - } -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/distributed.h b/paddle/fluid/operators/distributed/distributed.h deleted file mode 100644 index 5917c18fb0d201..00000000000000 --- a/paddle/fluid/operators/distributed/distributed.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
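The two tests deleted above pin down the MergeVars semantics from the removed communicator.h: ten LoDTensor inputs filled with the constants 0 through 9 sum to 45 in every element, and for SelectedRows row i appears in 10 - i of the inputs with value i, so it merges to i * (10 - i). A minimal sketch of the dense sum-or-average behaviour on plain vectors, illustrative only and using no Paddle types:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Sum equally-shaped dense inputs element-wise; divide by the number of
    // inputs when merge_add is false (the "average the gradients" path).
    std::vector<float> MergeDense(const std::vector<std::vector<float>>& ins,
                                  bool merge_add = true) {
      std::vector<float> out(ins.front().size(), 0.0f);
      for (const auto& in : ins) {
        assert(in.size() == out.size());  // all inputs must share one shape
        for (std::size_t i = 0; i < in.size(); ++i) out[i] += in[i];
      }
      if (!merge_add) {
        for (auto& v : out) v /= static_cast<float>(ins.size());
      }
      return out;
    }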
-
-#pragma once
-
-#ifdef PADDLE_WITH_DISTRIBUTE
-
-#ifdef PADDLE_WITH_GRPC
-#include "paddle/fluid/operators/distributed/communicator.h"
-
-#include "paddle/fluid/operators/distributed/grpc/grpc_client.h"
-#include "paddle/fluid/operators/distributed/grpc/grpc_server.h"
-#define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer
-#define RPCCLIENT_T paddle::operators::distributed::GRPCClient
-
-#else  // PADDLE_WITH_GRPC
-
-#include "paddle/fluid/operators/distributed/brpc/brpc_client.h"
-#include "paddle/fluid/operators/distributed/brpc/brpc_server.h"
-#define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer
-#define RPCCLIENT_T paddle::operators::distributed::BRPCClient
-
-#endif  // PADDLE_WITH_GRPC
-
-#endif  // PADDLE_WITH_DISTRIBUTE
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc
deleted file mode 100644
index 7d6756b41363d1..00000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-// NOTE: This file was originally created by tensorflow
-// (https://github.com/tensorflow/tensorflow/) we borrow this
-// file and did some modifications so that we can send gRPC
-// requests without too much copying of the tensor data.
-
-#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h"
-
-namespace grpc {
-class ByteBuffer;
-}  // namespace grpc
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-GrpcByteBufferSource::GrpcByteBufferSource() {}
-
-bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) {
-  cur_ = -1;
-  left_ = 0;
-  ptr_ = nullptr;
-  byte_count_ = 0;
-  bool ok = src.Dump(&slices_).ok();
-  if (!ok) {
-    slices_.clear();
-  }
-  return ok;
-}
-
-bool GrpcByteBufferSource::Next(const void** data, int* size) {
-  // Use loop instead of if in case buffer contained empty slices.
-  while (left_ == 0) {
-    // Advance to next slice.
-    cur_++;
-    if (cur_ >= slices_.size()) {
-      return false;
-    }
-    const ::grpc::Slice& s = slices_[cur_];
-    left_ = s.size();
-    ptr_ = reinterpret_cast<const char*>(s.begin());
-  }
-
-  *data = ptr_;
-  *size = left_;
-  byte_count_ += left_;
-  ptr_ += left_;
-  left_ = 0;
-  return true;
-}
-
-void GrpcByteBufferSource::BackUp(int count) {
-  ptr_ -= count;
-  left_ += count;
-  byte_count_ -= count;
-}
-
-bool GrpcByteBufferSource::Skip(int count) {
-  const void* data;
-  int size;
-  while (Next(&data, &size)) {
-    if (size >= count) {
-      BackUp(size - count);
-      return true;
-    }
-    // size < count;
-    count -= size;
-  }
-  // error or we have too large count;
-  return false;
-}
-
-google::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
-  return byte_count_;
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h
deleted file mode 100644
index 486870de7a554e..00000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-// NOTE: This file was originally created by tensorflow
-// (https://github.com/tensorflow/tensorflow/) we borrow this
-// file and did some modifications so that we can send gRPC
-// requests without too much copying of the tensor data.
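GrpcByteBufferSource, whose implementation was deleted above and whose declaration is removed below, follows protobuf's ZeroCopyInputStream contract: Next() yields one internal buffer at a time, BackUp(n) hands the last n bytes back unread, and Skip(count) can therefore be written as repeated Next() calls followed by one BackUp() of the excess. A small self-contained model of that contract over in-memory chunks, a sketch rather than the protobuf or gRPC classes:

    #include <cstddef>
    #include <string>
    #include <utility>
    #include <vector>

    // Minimal stand-in for a zero-copy input stream backed by a chunk list.
    class ChunkSource {
     public:
      explicit ChunkSource(std::vector<std::string> chunks)
          : chunks_(std::move(chunks)) {}

      // Yield the next unread region, usually a whole chunk.
      bool Next(const void** data, int* size) {
        if (backup_ > 0) {  // re-yield bytes that were handed back
          const std::string& c = chunks_[cur_ - 1];
          *data = c.data() + c.size() - backup_;
          *size = static_cast<int>(backup_);
          backup_ = 0;
          return true;
        }
        if (cur_ >= chunks_.size()) return false;
        const std::string& c = chunks_[cur_++];
        *data = c.data();
        *size = static_cast<int>(c.size());
        return true;
      }

      // Hand back the last `count` bytes of the region just yielded.
      void BackUp(int count) { backup_ = static_cast<std::size_t>(count); }

      // Same shape as GrpcByteBufferSource::Skip above.
      bool Skip(int count) {
        const void* data;
        int size;
        while (Next(&data, &size)) {
          if (size >= count) {
            BackUp(size - count);
            return true;
          }
          count -= size;
        }
        return false;  // ran out of data before skipping `count` bytes
      }

     private:
      std::vector<std::string> chunks_;
      std::size_t cur_ = 0;
      std::size_t backup_ = 0;
    };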
- -#pragma once - -#include - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "grpc++/grpc++.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -struct grpc_byte_buffer; - -namespace grpc { -// A ZeroCopyInputStream that reads from grpc_byte_buffer -class ByteBuffer; - -class GrpcBufferReader final - : public ::google::protobuf::io::ZeroCopyInputStream { - typedef void (CoreCodegenInterface::*OldReaderInitAPI)( - grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); - typedef int (CoreCodegenInterface::*NewReaderInitAPI)( - grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); - void ReaderInit(OldReaderInitAPI ptr, grpc_byte_buffer_reader* reader, - grpc_byte_buffer* buffer) { - (g_core_codegen_interface->*ptr)(reader, buffer); - } - void ReaderInit(NewReaderInitAPI ptr, grpc_byte_buffer_reader* reader, - grpc_byte_buffer* buffer) { - int result = (g_core_codegen_interface->*ptr)(reader, buffer); - (void)result; - } - - public: - explicit GrpcBufferReader(grpc_byte_buffer* buffer) - : byte_count_(0), backup_count_(0) { - ReaderInit(&CoreCodegenInterface::grpc_byte_buffer_reader_init, &reader_, - buffer); - } - ~GrpcBufferReader() override { - g_core_codegen_interface->grpc_byte_buffer_reader_destroy(&reader_); - } - - bool Next(const void** data, int* size) override { - if (backup_count_ > 0) { - *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) - - backup_count_; - GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX); - *size = static_cast(backup_count_); - backup_count_ = 0; - return true; - } - if (!g_core_codegen_interface->grpc_byte_buffer_reader_next(&reader_, - &slice_)) { - return false; - } - g_core_codegen_interface->grpc_slice_unref(slice_); - *data = GRPC_SLICE_START_PTR(slice_); - // On win x64, int is only 32bit - GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX); - byte_count_ += * size = static_cast(GRPC_SLICE_LENGTH(slice_)); - return true; - } - - void BackUp(int count) override { backup_count_ = count; } - - bool Skip(int count) override { - const void* data; - int size; - while (Next(&data, &size)) { - if (size >= count) { - BackUp(size - count); - return true; - } - // size < count; - count -= size; - } - // error or we have too large count; - return false; - } - - ::google::protobuf::int64 ByteCount() const override { - return byte_count_ - backup_count_; - } - - private: - int64_t byte_count_; - int64_t backup_count_; - grpc_byte_buffer_reader reader_; - grpc_slice slice_; -}; - -}; // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -// A ZeroCopyInputStream that reads from a grpc::ByteBuffer. -class GrpcByteBufferSource - : public ::google::protobuf::io::ZeroCopyInputStream { - public: - GrpcByteBufferSource(); - bool Init(const ::grpc::ByteBuffer& src); // Can be called multiple times. - bool Next(const void** data, int* size) override; - void BackUp(int count) override; - bool Skip(int count) override; - ::google::protobuf::int64 ByteCount() const override; - - private: - std::vector<::grpc::Slice> slices_; - size_t cur_; // Current slice index. - int left_; // Number of bytes in slices_[cur_] left to yield. - const char* ptr_; // Address of next byte in slices_[cur_] to yield. 
- ::google::protobuf::int64 byte_count_; -}; - -class GrpcByteBufferSourceWrapper : public Source { - public: - explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) - : source_(source) {} - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - return source_; - } - - private: - GrpcByteBufferSource* source_; -}; - -class GrpcByteSource : public Source { - public: - explicit GrpcByteSource(grpc_byte_buffer* buffer) : buffer_(buffer) {} - ~GrpcByteSource() override { DeleteStream(); } - - typedef ::grpc::GrpcBufferReader Reader; - - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - DeleteStream(); - stream_ = new (&space_) Reader(buffer_); - return stream_; - } - - private: - void DeleteStream() { - if (stream_) { - stream_->~Reader(); - } - } - - grpc_byte_buffer* buffer_; // Not owned - Reader* stream_ = nullptr; // Points into space_ if non-nullptr - char space_[sizeof(Reader)]; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc deleted file mode 100644 index 97a9c14e4f1850..00000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc +++ /dev/null @@ -1,671 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "glog/logging.h" // For VLOG -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_client.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/port.h" -#include "paddle/fluid/platform/profiler.h" - -DEFINE_int32(rpc_client_threads, 2, ""); -DECLARE_bool(rpc_disable_reuse_port); - -namespace paddle { -namespace operators { -namespace distributed { - -void GRPCClient::InitImpl() { - // start the client process thread - // TODO(wuyi): can make this in a threadpool - client_threads_.resize(FLAGS_rpc_client_threads); - for (int i = 0; i < FLAGS_rpc_client_threads; i++) { - client_threads_[i].reset( - new std::thread(std::bind(&GRPCClient::Proceed, this))); - } -} - -void GRPCClient::SendComplete() { - std::unique_lock lk(completed_mutex_); - if (!completed_) { - for (auto& it : channels_) { - VLOG(3) << "send complete message to " << it.first; - this->AsyncSendComplete(it.first); - } - PADDLE_ENFORCE_EQ(this->Wait(), true, platform::errors::PreconditionNotMet( - "internal grpc service error.")); - completed_ = true; - } -} - -GRPCClient::~GRPCClient() { - stopped_ = true; - Wait(); - cq_.Shutdown(); - { - std::lock_guard guard(chan_mutex_); - for (auto& it : channels_) { - it.second.reset(); - } - channels_.clear(); - } - for (size_t i = 0; i < client_threads_.size(); i++) - client_threads_[i]->join(); -} - -VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kSendRPC; - - int retry_times_ = 0; - - while (true) { - SendProcessor* s = new SendProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, p_scope, p_ctx, s, method, h, this] { - auto* var = p_scope->FindVar(var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = nullptr; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -void ProcGetResponse(const VarHandle& var_h, - const ::grpc::ByteBuffer& ret_msg) { - VLOG(4) << "ProcGetResponse"; - framework::Variable* outvar = nullptr; - // get response's trainer_id is not used - int trainer_id; - DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar, - &trainer_id); -} - -void ProcGetRecvResponse(const VarHandle& var_h, - const ::grpc::ByteBuffer& 
ret_msg) { - VLOG(4) << "ProcGetRecvResponse"; - framework::Variable* outvar = nullptr; - int trainer_id; - DeserializeRecvFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar, - &trainer_id); -} - -template -void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { - ::grpc::Slice slice(proto.ByteSizeLong()); - proto.SerializeWithCachedSizesToArray(const_cast(slice.begin())); - ::grpc::ByteBuffer tmp(&slice, 1); - result->Swap(&tmp); -} - -VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname, - "/sendrecv.SendRecvService/GetVariable", table_name, - time_out); -} - -VarHandlePtr GRPCClient::AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, int64_t time_out) { - std::string var_name_no_barrier = - string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); - - return _AsyncGetVar( - ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname, - "/sendrecv.SendRecvService/GetVariableNoBarrier", "", time_out); -} - -VarHandlePtr GRPCClient::AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, kGetMonomerRPC, var_name, var_name, - "/sendrecv.SendRecvService/GetMonomerVariable", "", - time_out); -} - -VarHandlePtr GRPCClient::_AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& method, - const std::string& var_name, const std::string& out_varname, - const std::string& rpc_path, const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const std::string out_varname_val = out_varname; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - - int retry_times_ = 0; - - while (true) { - GetProcessor* s = new GetProcessor(ch); - - VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, out_varname_val, table_name_val, s, method, - p_ctx, h, rpc_path, this] { - // prepare input - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_out_varname(out_varname_val); - req.set_trainer_id(trainer_id_); - req.set_table_name(table_name_val); - ::grpc::ByteBuffer buf; - RequestToByteBuffer(req, &buf); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = - s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - 
std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string in_var_name_val = in_var_name; - const std::string out_var_name_val = out_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - - const std::string method = kPrefetchRPC; - int retry_times_ = 0; - - while (true) { - GetProcessor* s = new GetProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); - s->Prepare(h, kPrefetchTimeout); - - auto* var = p_scope->FindVar(in_var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val, - 0, table_name_val); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, static_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = kBatchBarrierRPC; - VarHandlePtr h( - new VarHandle(ep, method, BATCH_BARRIER_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(BATCH_BARRIER_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); - const std::string method = kFetchBarrierRPC; - VarHandlePtr h( - new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(FETCH_BARRIER_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, - const std::string& var_name, - int64_t time_out) { - const auto ch = GetChannel(ep); - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - 
const std::string method = kSendMonomerFetchBarrierRPC; - VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr)); - s->Prepare(h, time_out); - - VLOG(30) << s->GetVarHandlePtr()->String() << " begin"; - - sendrecv::VariableMessage req; - req.set_varname(var_name); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = kSendCompleteRPC; - VarHandlePtr h(new VarHandle(ep, method, COMPLETE_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_trainer_id(trainer_id_); - req.set_varname(COMPLETE_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, - const std::string& dirname, - const std::string& varname, - const int mode, - int64_t time_out) { - const auto ch = GetChannel(ep); - - CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); - - const std::string method = kCheckPointNotifyRPC; - - VarHandlePtr h( - new VarHandle(ep, method, CHECKPOINT_SAVE_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(varname); - req.set_table_name(std::to_string(mode)); - req.set_out_varname(dirname); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kRequestNotify; - - SendProcessor* s = new SendProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, p_scope, p_ctx, s, method, h, this] { - auto* var = p_scope->FindVar(var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = nullptr; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/DistributeNotify", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - }); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendAndRecv(const std::string& ep, - 
const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& send_var_name, - const std::string& recv_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string send_var_name_val = send_var_name; - const std::string recv_var_name_val = recv_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kSendAndRecvRPC; - VLOG(4) << "GRPCClient::SendAndRecv Begin ,Send_var_name: " - << send_var_name_val << " Recv_var_name: " << recv_var_name_val; - int retry_times_ = 0; - - while (true) { - SendAndRecvProcessor* s = new SendAndRecvProcessor(ch); - VarHandlePtr h( - new VarHandle(ep, method, send_var_name_val, p_ctx, p_scope)); - VarHandlePtr h_recv( - new VarHandle(ep, method, recv_var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - s->RecvPrepare(h_recv); - - framework::Async([send_var_name_val, recv_var_name_val, table_name_val, - p_scope, p_ctx, s, method, h, this] { - auto* send_var = p_scope->FindVar(send_var_name_val); - send_var->GetMutable()->set_lod({}); - ::grpc::ByteBuffer buf; - VLOG(4) << "SerializeToByteBuffer: send_var_name_val: " - << send_var_name_val - << " recv_var_name_val: " << recv_var_name_val; - SerializeToByteBuffer(send_var_name_val, send_var, *p_ctx, &buf, - recv_var_name_val, trainer_id_, table_name_val); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetRecvResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/SendAndRecvVariable", - buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -bool GRPCClient::Wait() { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); }); - return ok_; -} - -inline bool ShouldRetry(const std::string& method, int error_code) { - if (method == kPrefetchRPC) { - return true; - } - - if (error_code == grpc::StatusCode::DEADLINE_EXCEEDED) { - return true; - } - - return false; -} - -void GRPCClient::Proceed() { - void* tag = nullptr; - bool ok = false; - - VLOG(3) << "GRPCClient Proceed begin"; - while (!stopped_ && cq_.Next(&tag, &ok)) { - BaseProcessor* c = static_cast(tag); - GPR_ASSERT(ok); - PADDLE_ENFORCE_NOT_NULL( - c, platform::errors::PreconditionNotMet("Make BaseProcessor failed.")); - - if (c->status_.ok()) { - VLOG(3) << c->GetVarHandlePtr()->String() << " process"; - c->Process(); - } else if (ShouldRetry(c->GetVarHandlePtr()->method(), - c->status_.error_code())) { - VLOG(0) << c->GetVarHandlePtr()->String() - << " meets grpc error, error_code:" << c->status_.error_code() - << " error_message:" << c->status_.error_message() - << " error_details:" << c->status_.error_details() - << " should retry!"; - c->GetVarHandlePtr()->should_retry = true; - c->Finish(false); - } else { - 
PADDLE_THROW(platform::errors::External( - "%s meets grpc error, error_code is %d, error message is %s, error " - "details is %s.", - c->GetVarHandlePtr()->String(), c->status_.error_code(), - c->status_.error_message(), c->status_.error_details())); - c->Finish(false); - } - - bool notify = false; - { - std::lock_guard lk(sync_mutex_); - req_count_--; - notify = (req_count_ <= 0 || !c->status_.ok()); - } - - delete c; - - if (notify) { - sync_cond_.notify_all(); - } - } - - // Last log message - // Avoid using VLOG() and LOG(): in the destructor of google::LogMessage() a - // static Mutex log_mutex is used for synchronization, which might have been - // destructed at this moment. - if (FLAGS_v >= 3) { - std::string msg("GRPCClient Proceed end"); - fwrite(msg.c_str(), msg.length(), 1, stderr); - } -} - -std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { - std::lock_guard guard(chan_mutex_); - auto it = channels_.find(ep); - if (it != channels_.end()) { - return it->second; - } - - // Channel configurations: - grpc::ChannelArguments args; - args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000); - if (FLAGS_rpc_disable_reuse_port) { - args.SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0); - } - args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE); - args.SetMaxSendMessageSize(std::numeric_limits::max()); - args.SetMaxReceiveMessageSize(std::numeric_limits::max()); - - auto ch = - grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args); - channels_[ep] = ch; - return ch; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h deleted file mode 100644 index 5885f944b60a15..00000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.h +++ /dev/null @@ -1,321 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
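Every Async* method of the GRPCClient deleted above shares one retry shape: issue the RPC, and if FLAGS_rpc_retry_times allows another attempt and ShouldRetry marked the handle (prefetch requests and DEADLINE_EXCEEDED errors), sleep a few random milliseconds and reissue. A stripped-down sketch of that bounded retry-with-jitter loop; SendWithRetry and send_once are placeholder names, not Paddle API:

    #include <chrono>
    #include <functional>
    #include <random>
    #include <thread>

    // Retry `send_once` up to `max_retries` extra times, sleeping a random
    // 0-4 ms between attempts, mirroring the client loop above.
    bool SendWithRetry(const std::function<bool()>& send_once, int max_retries) {
      std::random_device rd;
      for (int retry = 0;; ++retry) {
        if (send_once()) return true;            // RPC finished successfully
        if (retry >= max_retries) return false;  // give up, surface the error
        std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5));
      }
    }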
*/ - -#pragma once - -#include -#include -#include // NOLINT -#include // NOLINT -#include -#include -#include -#include -#include -#include // NOLINT -#include -#include // NOLINT -#include -#include - -#include "grpc++/channel.h" -#include "grpc++/generic/generic_stub.h" -#include "grpc++/grpc++.h" -#include "grpc++/support/byte_buffer.h" -#include "grpc++/support/slice.h" -#include "grpc/support/log.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN - -namespace grpc { -class Channel; -} // namespace grpc -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); - -void ProcGetRecvResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); - -class BaseProcessor { - public: - BaseProcessor() { context_ = nullptr; } - - virtual ~BaseProcessor() {} - - virtual void Prepare(VarHandlePtr h, int64_t time_out) { - var_h_ = h; - - context_.reset(new grpc::ClientContext()); - context_->set_wait_for_ready(true); - if (time_out) { - std::chrono::system_clock::time_point deadline = - std::chrono::system_clock::now() + - std::chrono::milliseconds(time_out); - context_->set_deadline(deadline); - } - } - - void Process() { - ProcessImpl(); - var_h_->Finish(true); - } - - VarHandlePtr GetVarHandlePtr() { return var_h_; } - bool Wait() { return var_h_->Wait(); } - void Finish(bool ok) { return var_h_->Finish(ok); } - virtual void ProcessImpl() = 0; - - std::unique_ptr context_; - grpc::Status status_; - - protected: - VarHandlePtr var_h_; -}; - -typedef std::function - RequestSendCallBack; - -class SendProcessor : public BaseProcessor { - public: - explicit SendProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~SendProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_.get(), reply_); - } - } - - ::grpc::GenericStub stub_g_; - ::grpc::ByteBuffer reply_; - RequestSendCallBack response_call_back_ = nullptr; -}; - -typedef std::function - RequestGetCallBack; - -class GetProcessor : public BaseProcessor { - public: - explicit GetProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~GetProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_.get(), reply_); - } - } - - ::grpc::ByteBuffer reply_; - ::grpc::GenericStub stub_g_; - RequestGetCallBack response_call_back_ = ProcGetResponse; -}; - -class SendAndRecvProcessor : public BaseProcessor { - public: - explicit SendAndRecvProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~SendAndRecvProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_recv_.get(), reply_); - var_h_recv_->Finish(true); - } - } - - void RecvPrepare(VarHandlePtr h_recv) { 
var_h_recv_ = h_recv; } - - ::grpc::ByteBuffer reply_; - ::grpc::GenericStub stub_g_; - RequestGetCallBack response_call_back_ = ProcGetResponse; - VarHandlePtr var_h_recv_; -}; - -class BatchBarrierProcessor : public BaseProcessor { - public: - explicit BatchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~BatchBarrierProcessor() {} - - void ProcessImpl() override {} - sendrecv::VoidMessage reply_; - std::unique_ptr stub_; -}; - -class FetchBarrierProcessor : public BaseProcessor { - public: - explicit FetchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~FetchBarrierProcessor() {} - - void ProcessImpl() override {} - sendrecv::VariableMessage reply_; - std::unique_ptr stub_; -}; - -class CheckpointNotifyProcessor : public BaseProcessor { - public: - explicit CheckpointNotifyProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~CheckpointNotifyProcessor() {} - - void ProcessImpl() override {} - sendrecv::VoidMessage reply_; - std::unique_ptr stub_; -}; - -class GRPCClient : public RPCClient { - public: - GRPCClient() : ok_(true), completed_(false), stopped_(false) {} - virtual ~GRPCClient(); - - VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) override; - - VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendAndRecv(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& send_var_name, - const std::string& recv_var_name, - const std::string& table_name = "", 
- int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendComplete( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - bool Wait() override; - - void SendComplete() override; - - void InitImpl() override; - - private: - void Proceed(); - - std::shared_ptr GetChannel(const std::string& ep); - VarHandlePtr _AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& method, - const std::string& var_name, const std::string& out_varname, - const std::string& rpc_path, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline); - - private: - grpc::CompletionQueue cq_; - std::unordered_map> channels_; - std::vector> client_threads_; - - // mutex for Wait client sync - std::mutex sync_mutex_; - std::condition_variable sync_cond_; - std::atomic req_count_{0}; - bool ok_; - - // mutex for GetChannel thread safety - std::mutex chan_mutex_; - DISABLE_COPY_AND_ASSIGN(GRPCClient); - - // mutex for sending complete message only once - std::mutex completed_mutex_; - bool completed_; - - volatile bool stopped_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc deleted file mode 100644 index 0fc9b695779149..00000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc +++ /dev/null @@ -1,190 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_NCCL -#include -#endif -#ifdef PADDLE_WITH_RCCL -#include -#endif -#include -#include -#include "grpcpp/impl/codegen/byte_buffer.h" -#include "grpcpp/impl/codegen/slice.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/proto_encoder_helper.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace framework { -class Scope; -class Variable; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -void SerializeToByteBuffer(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, - ::grpc::ByteBuffer* msg, const std::string& out_name, - const int trainer_id, - const std::string& table_name) { - platform::RecordRPCEvent record_event("serial"); - VarMsg request; - TensorPayload* payload = nullptr; - - request.set_varname(name); - request.set_trainer_id(trainer_id); - // Note: normally the profiler is enabled in 1 trainer, hence only - // 1 trainer returns true for ShouldSendProfileState(). 
It tells PS - // servers the trainer's profiling state so that PS can follow the - // trainer. - if (platform::ShouldSendProfileState()) { - if (platform::IsProfileEnabled()) { - request.set_profile(platform::kEnableProfiler); - } else { - request.set_profile(platform::kDisableProfiler); - } - } - if (!out_name.empty()) { - request.set_out_varname(out_name); - } - if (!table_name.empty()) { - request.set_table_name(table_name); - } - if (var->IsType()) { - request.set_type(::sendrecv::LOD_TENSOR); - payload = new TensorPayload(GetTensorPayload(var, ctx, &request)); - } else if (var->IsType()) { - request.set_type(::sendrecv::SELECTED_ROWS); - payload = new TensorPayload(GetSelectedRowsPayload(var, ctx, &request)); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - } else if (var->IsType()) { - request.set_type(::sendrecv::NCCL_ID); -#endif - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Serialize does not support type: %s", typeid(var->Type()).name())); - } - std::string header; - request.AppendToString(&header); - auto buffer = std::unique_ptr(new char[1024]); - void* buf = buffer.get(); - ProtoEncodeHelper e(static_cast(buf), 1024); - e.WriteRawBytes(std::string(header.data(), header.size())); -// NCCLID is copied directly to the message, return bytebuffer -// with only one slice if serializing NCCLID. -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (var->IsType()) { - e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, - NCCL_UNIQUE_ID_BYTES); - const ncclUniqueId& uid = var->Get(); - e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES)); - - // for serialize NCCL_ID - ::grpc::Slice slices(e.size()); - memcpy(const_cast(slices.begin()), e.data(), e.size()); - ::grpc::ByteBuffer tmp(&slices, 1); - msg->Swap(&tmp); - return; - } -#endif - PADDLE_ENFORCE_NOT_NULL( - payload, - platform::errors::InvalidArgument( - "Not support type: %s, need to be LOD_TENSOR or SELECTED_ROWS", - var->Type())); - e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, - payload->memory_size()); - if (payload->memory_size() >= std::numeric_limits::max()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Variable %s length %d should less than %d.", name, - payload->memory_size(), std::numeric_limits::max())); - } - // steal reference of tensor data - ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows - int num_slices = 2; // only SelectedRows have rows buffer - slices[0] = ::grpc::Slice(e.size()); - memcpy(const_cast(slices[0].begin()), e.data(), e.size()); - slices[1] = ::grpc::Slice( - grpc_slice_new_with_user_data(payload->ptr(), payload->memory_size(), - SerializeDestroyCallback, payload), - ::grpc::Slice::STEAL_REF); - - if (var->IsType()) { - auto* slr = var->GetMutable(); - ProtoEncodeHelper e2(static_cast(buf), 128); - - PADDLE_ENFORCE_EQ(VectorElemName(slr->rows()), typeid(int64_t).name(), - platform::errors::InvalidArgument( - "Got wrong type %s, expect type: int64_t", - VectorElemName(slr->rows()))); - size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); - - e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); - slices[2] = ::grpc::Slice(e2.size()); - memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); - - slices[3] = ::grpc::Slice( - grpc_slice_new_with_user_data( - const_cast( - reinterpret_cast(slr->rows().data())), - rows_memory_size, [](void* backing) {}, - const_cast( - reinterpret_cast(slr->rows().data()))), - ::grpc::Slice::STEAL_REF); - num_slices = 4; - } - 
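The slices above hand the tensor and rows buffers to gRPC by reference rather than copying them into the message body. A minimal standalone sketch of that zero-copy wrapping, assuming a heap-allocated buffer payload of len bytes (hypothetical names, not taken from this file):

#include <cstddef>
#include <cstdlib>

#include "grpcpp/impl/codegen/byte_buffer.h"
#include "grpcpp/impl/codegen/slice.h"

// Wrap an existing buffer in a ByteBuffer without copying it. The destroy
// callback runs only after gRPC has finished sending, so the buffer must
// stay alive until then.
::grpc::ByteBuffer WrapWithoutCopy(void* payload, size_t len) {
  ::grpc::Slice slice(
      grpc_slice_new_with_user_data(
          payload, len, [](void* p) { std::free(p); }, payload),
      ::grpc::Slice::STEAL_REF);
  return ::grpc::ByteBuffer(&slice, 1);
}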
::grpc::ByteBuffer tmp(&slices[0], num_slices); - msg->Swap(&tmp); -} - -void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id) { - platform::RecordRPCEvent record_event("deserial"); - operators::distributed::GRPCVariableResponse resp(scope, &ctx); - PADDLE_ENFORCE_EQ( - resp.Parse(msg), 0, - platform::errors::InvalidArgument("parse bytebuffer to tensor error!")); - *var = resp.GetVar(); - *trainer_id = resp.GetTrainerId(); -} - -void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id) { - platform::RecordRPCEvent record_event("deserial"); - operators::distributed::GRPCVariableResponse resp(scope, &ctx); - PADDLE_ENFORCE_EQ( - resp.Parse(msg), 0, - platform::errors::InvalidArgument("parse bytebuffer to tensor error!")); - *var = resp.GetRecvVar(); - *trainer_id = resp.GetTrainerId(); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.h b/paddle/fluid/operators/distributed/grpc/grpc_serde.h deleted file mode 100644 index 932f3e2f069a2b..00000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_serde.h +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/port.h" - -namespace grpc { -class ByteBuffer; -} // namespace grpc -namespace paddle { -namespace framework { -class Scope; -class Variable; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -typedef void (*DestroyCallback)(void*); - -void SerializeToByteBuffer(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, - ::grpc::ByteBuffer* msg, - const std::string& out_varname = std::string(), - const int trainer_id = 0, - const std::string& table_name = std::string()); - -void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id); - -void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc deleted file mode 100644 index d407a72938a741..00000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include // NOLINT - -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/printf.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace operators = paddle::operators; -namespace math = paddle::operators::math; -namespace memory = paddle::memory; - -void RunSerdeTestSelectedRows(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - // serialize var to ByteBuffer - framework::Variable var; - auto* slr = var.GetMutable(); - slr->set_height(1000); - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({564, 128})); - tensor->mutable_data(place); - int tensor_numel = 564 * 128; - math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 564; ++i) rows->push_back(i); - - ::grpc::ByteBuffer msg; - operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg); - EXPECT_GT(msg.Length(), static_cast(0)); - - // deserialize - std::vector<::grpc::Slice> slices; - (void)msg.Dump(&slices); - std::string tmp; - for (const auto& s : slices) { - tmp.append(reinterpret_cast(s.begin()), s.size()); - } - - sendrecv::VariableMessage varmsg; - EXPECT_TRUE(varmsg.ParseFromString(tmp)); - - // deserialize bytebuffer - EXPECT_EQ(varmsg.varname(), "myvar"); - EXPECT_EQ(varmsg.type(), 1); - - const float* tensor_data = - reinterpret_cast(varmsg.serialized().data()); - const int64_t* rows_data = - reinterpret_cast(varmsg.rows().data()); - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data[i], 32.7); - } - for (int i = 0; i < 564; ++i) { - EXPECT_EQ(rows_data[i], i); - } - - // deserialize zero-copy - // framework::Variable var2; - // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2); - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::GRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto* slr2 = var2->GetMutable(); - auto* tensor2 = slr2->mutable_value(); - auto* rows2 = slr2->mutable_rows(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(*tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2->data()); - } - const int64_t* rows_data2 = rows2->data(); - - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); - } - for (size_t i = 0; i < rows2->size(); ++i) { - EXPECT_EQ(rows_data2[i], static_cast(i)); - } - EXPECT_EQ(slr2->height(), 1000); -} - -void RunTestLodTensor(platform::Place place, int from_type = 0) { - // serialize var to ByteBuffer - framework::Variable var; - auto* tensor = var.GetMutable(); - tensor->Resize(framework::make_ddim({512, 8, 4, 2})); - framework::LoD lod; - lod.push_back(framework::Vector({1, 
3, 8})); - tensor->set_lod(lod); - int tensor_numel = 512 * 8 * 4 * 2; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 31.9); - - ::grpc::ByteBuffer msg; - operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg, - "outvar", 0, "table_name"); - EXPECT_GT(msg.Length(), static_cast(0)); - - // deserialize - std::vector<::grpc::Slice> slices; - (void)msg.Dump(&slices); - std::string tmp; - for (const auto& s : slices) { - tmp.append(reinterpret_cast(s.begin()), s.size()); - } - sendrecv::VariableMessage varmsg; - EXPECT_TRUE(varmsg.ParseFromString(tmp)); - EXPECT_EQ(varmsg.varname(), "myvar"); - EXPECT_EQ(varmsg.type(), 0); - EXPECT_EQ(varmsg.dims()[0], 512); - EXPECT_EQ(varmsg.dims()[1], 8); - EXPECT_EQ(varmsg.dims()[2], 4); - EXPECT_EQ(varmsg.dims()[3], 2); - EXPECT_EQ(varmsg.lod_level(), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); - EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); - - const float* tensor_data = - reinterpret_cast(varmsg.serialized().data()); - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data[i], 31.9); - } - - // message binary - std::string str; - varmsg.SerializeToString(&str); - - // message bytebuffer - ::grpc::Slice slices_2[1]; - int num_slices = 1; - slices_2[0] = ::grpc::Slice(str.length()); - memcpy(const_cast(slices_2[0].begin()), str.c_str(), str.length()); - ::grpc::ByteBuffer bytebuffer2(&slices_2[0], num_slices); - - // deserialize zero-copy - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::GRPCVariableResponse resp(&scope, &ctx); - if (from_type == 0) { - EXPECT_EQ(resp.Parse(msg), 0); - } else { - EXPECT_EQ(resp.Parse(bytebuffer2), 0); - } - - framework::Variable* var2 = resp.GetVar(); - - auto tensor2 = var2->Get(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2.data()); - } - - EXPECT_EQ(varmsg.lod_level(), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); - EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); - for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); -} - -TEST(LodTensor, Run) { - platform::CPUPlace place; - RunTestLodTensor(place); - RunTestLodTensor(place, 1); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu(0); - RunTestLodTensor(gpu); - RunTestLodTensor(gpu, 1); -#endif -} - -TEST(SelectedRows, Run) { - platform::CPUPlace place; - RunSerdeTestSelectedRows(place); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu; - RunSerdeTestSelectedRows(gpu); -#endif -} diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc deleted file mode 100644 index 912520d782d756..00000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ /dev/null @@ -1,720 +0,0 @@ -/*Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include - -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_server.h" - -namespace grpc { -class ChannelArguments; -} // namespace grpc -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -namespace operators { -namespace distributed { -class GRPCVariableResponse; -} // namespace distributed -} // namespace operators -} // namespace paddle - -using ::grpc::ServerAsyncResponseWriter; - -DECLARE_bool(rpc_disable_reuse_port); -DECLARE_int32(rpc_retry_bind_port); - -namespace paddle { -namespace operators { -namespace distributed { - -enum CallStatus { PROCESS = 0, FINISH }; - -// reference: -// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server -class RequestBase { - public: - explicit RequestBase(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : service_(service), - cq_(cq), - status_(PROCESS), - request_handler_(request_handler), - req_id_(req_id) { - PADDLE_ENFORCE_NOT_NULL(cq_, platform::errors::InvalidArgument( - "ServerCompletionQueue cq are empty")); - } - virtual ~RequestBase() {} - virtual void Process() = 0; - - std::string Status2String(const std::string& method) { - std::string status = "Process"; - if (status_ == FINISH) { - status = "Finish"; - } - - std::ostringstream s; - s << method << " name:[" << GetReqName() << "]" - << ", ep:[" << ctx_.peer() << "]" - << " " << status << " using req_id:" << req_id_; - return s.str(); - } - - CallStatus Status() const { - std::lock_guard l(status_mu_); - return status_; - } - - template - void Finish(const T& reply, ServerAsyncResponseWriter* responder) { - std::lock_guard l(status_mu_); - status_ = FINISH; - responder->Finish(reply, ::grpc::Status::OK, - reinterpret_cast(static_cast(req_id_))); - } - virtual std::string GetReqName() = 0; - - protected: - mutable std::mutex status_mu_; - ::grpc::ServerContext ctx_; - GrpcService::AsyncService* service_; - ::grpc::ServerCompletionQueue* cq_; - CallStatus status_; - RequestHandler* request_handler_; - int req_id_; -}; - -class RequestSend final : public RequestBase { - public: - explicit RequestSend(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = static_cast(distributed::GrpcMethod::kSendVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - virtual ~RequestSend() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string varname = GetReqName(); - - auto scope = request_->GetMutableLocalScope(); - auto invar = request_->GetVar(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestSend var_name:" << varname << " trainer: 
" << trainer_id; - - framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); - Finish(reply_, &responder_); - } - - protected: - sendrecv::VoidMessage reply_; - std::shared_ptr request_; - ServerAsyncResponseWriter responder_; -}; - -class RequestGet final : public RequestBase { - public: - explicit RequestGet(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - auto method_id = static_cast(distributed::GrpcMethod::kGetVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGet() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. - std::string varname = request_.varname(); - std::string out_varname = request_.out_varname(); - std::string table_name = request_.table_name(); - int trainer_id = request_.trainer_id(); - - VLOG(4) << "RequestGet " << out_varname << " from " << varname; - - auto scope = request_handler_->scope(); - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - tmp_scope_ = std::move(scope->NewTmpScope()); - request_handler_->Handle(varname, tmp_scope_.get(), invar, &outvar, - trainer_id, out_varname, table_name); - - VLOG(1) << "before SerializeToByteBuffer"; - if (outvar) { - SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), - &reply_); - } - VLOG(1) << "after SerializeToByteBuffer"; - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - std::unique_ptr tmp_scope_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -class RequestGetNoBarrier final : public RequestBase { - public: - explicit RequestGetNoBarrier(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetVariableNoBarrier); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetNoBarrier() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. 
- std::string varname = request_.varname(); - std::string out_varname = request_.out_varname(); - int trainer_id = request_.trainer_id(); - - VLOG(4) << "RequestGetNoBarrier " << out_varname << " from " << varname; - - auto scope = request_handler_->scope(); - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), - &reply_); - } - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -class RequestGetMonomerVariable final : public RequestBase { - public: - explicit RequestGetMonomerVariable(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, - int req_id, RPCServer* rpc_server) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - rpc_server_(rpc_server) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetMonomerVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetMonomerVariable() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. - std::string varname = request_.varname(); - - rpc_server_->WaitVarCond(varname); - MonomerHandle h = rpc_server_->GetMonomer(varname); - - auto scope = h.scope_; - auto invar = scope->FindVar(varname); - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, - request_.trainer_id()); - - if (outvar) { - SerializeToByteBuffer(varname, outvar, *h.dev_ctx_, &reply_); - } - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; - RPCServer* rpc_server_{nullptr}; -}; - -class RequestGetMonomerBarrier final : public RequestBase { - public: - explicit RequestGetMonomerBarrier(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id, - RPCServer* rpc_server) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - rpc_server_(rpc_server) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetMonomerBarrier); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetMonomerBarrier() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. 
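// Block on WaitVarCond until the monomer variable is ready, then pass the
// barrier event to the registered handler; the reply is an empty VoidMessage.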
- std::string varname = request_.varname(); - VLOG(4) << "RequestGetMonomerBarrier " << varname; - - rpc_server_->WaitVarCond(varname); - MonomerHandle h = rpc_server_->GetMonomer(varname); - - framework::Scope* scope = nullptr; - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, - request_.trainer_id()); - - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - sendrecv::VoidMessage reply_; - ServerAsyncResponseWriter responder_; - RPCServer* rpc_server_{nullptr}; -}; - -class RequestPrefetch final : public RequestBase { - public: - explicit RequestPrefetch(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - local_scope_(nullptr) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = - static_cast(distributed::GrpcMethod::kPrefetchVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestPrefetch() {} - - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - // prefetch process... - std::string in_var_name = request_->Varname(); - std::string out_var_name = request_->OutVarname(); - std::string table_name = request_->TableName(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name - << " out_var_name: " << out_var_name << " trainer: " << trainer_id; - - auto scope = request_->GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - // out var must be created in local scope! 
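// The output variable is created in the request's own local scope
// (GetMutableLocalScope above) rather than looked up in the shared server scope.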
- framework::Variable* outvar = scope->Var(out_var_name); - - request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - - SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), - &reply_); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; - framework::Scope* local_scope_; -}; - -class RequestCheckpointNotify final : public RequestBase { - public: - explicit RequestCheckpointNotify(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx())); - int method_id = - static_cast(distributed::GrpcMethod::kCheckpointNotify); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestCheckpointNotify() {} - - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - auto scope = request_->GetMutableLocalScope(); - - std::string checkpoint_notify = request_->Varname(); - std::string checkpoint_dir = request_->OutVarname(); - int trainer_id = request_->GetTrainerId(); - std::string table_name = request_->TableName(); - - VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify - << ", dir: " << checkpoint_dir; - - request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr, - trainer_id, checkpoint_dir, table_name); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - sendrecv::VoidMessage reply_; - ServerAsyncResponseWriter responder_; -}; - -class RequestNotify final : public RequestBase { - public: - explicit RequestNotify(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = static_cast(distributed::GrpcMethod::kRequestNotify); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - virtual ~RequestNotify() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string varname = GetReqName(); - VLOG(4) << "RequestNotify var_name:" << varname; - - auto scope = request_->GetMutableLocalScope(); - auto invar = request_->GetVar(); - int trainer_id = request_->GetTrainerId(); - framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); - Finish(reply_, &responder_); - } - - protected: - sendrecv::VoidMessage reply_; - std::shared_ptr request_; - ServerAsyncResponseWriter responder_; -}; - -class RequestSendAndRecv final : public RequestBase { - public: - explicit RequestSendAndRecv(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - - int method_id = - static_cast(distributed::GrpcMethod::kRequestSendAndRecv); - - 
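The RequestAsyncUnary call below passes the integer req_id through the completion queue as an opaque tag pointer, and HandleRequest later recovers it with the inverse cast. A minimal sketch of that round trip, with hypothetical helper names:

#include <cstdint>

// Encode a small request id as the opaque void* tag handed to gRPC.
inline void* EncodeReqIdTag(int req_id) {
  return reinterpret_cast<void*>(static_cast<std::intptr_t>(req_id));
}

// Decode the tag back into the request id once the completion queue
// returns it from Next().
inline int DecodeReqIdTag(void* tag) {
  return static_cast<int>(reinterpret_cast<std::intptr_t>(tag));
}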
service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestSendAndRecv() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string in_var_name = request_->Varname(); - std::string out_var_name = request_->OutVarname(); - std::string table_name = request_->TableName(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestSendAndRecv, in_var_name: " << in_var_name - << " out_var_name: " << out_var_name << " trainer: " << trainer_id; - auto scope = request_->GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - framework::Variable* outvar = nullptr; - request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), - &reply_); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -void AsyncGRPCServer::WaitServerReady() { - VLOG(4) << "AsyncGRPCServer is waiting server ready"; - std::unique_lock lock(this->mutex_ready_); - condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(4) << "AsyncGRPCServer WaitSeverReady"; -} - -// Define an option subclass in order to disable SO_REUSEPORT for the -// server socket. -// Come from: -// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc -class NoReusePortOption : public ::grpc::ServerBuilderOption { - public: - void UpdateArguments(::grpc::ChannelArguments* args) override { - args->SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0); - } - - void UpdatePlugins(std::vector>* - plugins) override {} -}; - -void AsyncGRPCServer::StartServer() { - for (int i = 0; i < FLAGS_rpc_retry_bind_port; i++) { - ::grpc::ServerBuilder builder; - std::unique_ptr service( - new GrpcService::AsyncService()); - builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(), - &selected_port_); - - builder.SetMaxSendMessageSize(std::numeric_limits::max()); - builder.SetMaxReceiveMessageSize(std::numeric_limits::max()); - if (FLAGS_rpc_disable_reuse_port) { - builder.SetOption( - std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption)); - LOG(INFO) << "set FLAGS_rpc_disable_reuse_port"; - } - builder.RegisterService(service.get()); - - for (auto t : rpc_call_map_) { - rpc_cq_[t.first].reset(builder.AddCompletionQueue().release()); - } - - server_ = builder.BuildAndStart(); - if (selected_port_ != 0) { - LOG(INFO) << "Server listening on " << bind_address_ - << " successful, selected port: " << selected_port_; - service_.reset(service.release()); - break; - } - - LOG(WARNING) << "Server listening on " << bind_address_ - << " failed, selected port: " << selected_port_ - << ", retry after 3 seconds!"; - - sleep(3); - } - - PADDLE_ENFORCE_NE( - selected_port_, 0, - platform::errors::Unavailable("can't bind to address:%s", bind_address_)); - - std::function f = - std::bind(&AsyncGRPCServer::TryToRegisterNewOne, this, - std::placeholders::_1, std::placeholders::_2); - - for (auto& t : rpc_call_map_) { - auto& rpc_name = t.first; - auto& cq = rpc_cq_[rpc_name]; - auto threadnum = rpc_thread_num_[rpc_name]; - auto& reqs = rpc_reqs_[rpc_name]; - - reqs.reserve(kRequestBufSize); - - for (int i = 0; i < kRequestBufSize; i++) { - VLOG(6) << "TryToRegisterNewOne on RPC NAME: " << rpc_name << " 
I: " << i; - TryToRegisterNewOne(rpc_name, i); - } - - for (int i = 0; i < threadnum; i++) { - rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind( - &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f))); - VLOG(4) << t.first << " creates threads!"; - } - } - - { - std::lock_guard lock(this->mutex_ready_); - ready_ = 1; - } - condition_ready_.notify_all(); - - // wait server - server_->Wait(); - - for (auto& t : rpc_threads_) { - auto& threads = t.second; - for (size_t i = 0; i < threads.size(); ++i) { - threads[i]->join(); - VLOG(4) << t.first << " threads ends!"; - } - } -} - -void AsyncGRPCServer::ShutdownQueue() { - for (auto& t : rpc_cq_) { - t.second->Shutdown(); - VLOG(4) << t.first << " queue shutdown!"; - } -} - -void AsyncGRPCServer::ShutDownImpl() { - std::unique_lock lock(cq_mutex_); - is_shut_down_ = true; - ShutdownQueue(); - - VLOG(4) << "server_ shutdown!"; - server_->Shutdown(); -} - -void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, - int req_id) { - std::unique_lock lock(cq_mutex_); - if (is_shut_down_) { - VLOG(4) << "shutdown, do not TryToRegisterNewSendOne"; - return; - } - - VLOG(4) << "TryToRegisterNewOne on RPC NAME: " << rpc_name - << " REQ ID: " << req_id; - - auto& reqs = rpc_reqs_[rpc_name]; - auto& handler = rpc_call_map_[rpc_name]; - auto& cq = rpc_cq_[rpc_name]; - - RequestBase* b = nullptr; - if (rpc_name == kRequestSend) { - b = new RequestSend(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestGet) { - b = new RequestGet(service_.get(), cq.get(), handler, req_id); - - } else if (rpc_name == kRequestGetNoBarrier) { - b = new RequestGetNoBarrier(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestGetMonomerVariable) { - b = new RequestGetMonomerVariable(service_.get(), cq.get(), handler, req_id, - this); - } else if (rpc_name == kRequestGetMonomerBarrier) { - b = new RequestGetMonomerBarrier(service_.get(), cq.get(), handler, req_id, - this); - } else if (rpc_name == kRequestPrefetch) { - b = new RequestPrefetch(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestCheckpoint) { - b = new RequestCheckpointNotify(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestNotify) { - b = new RequestNotify(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestSendAndRecv) { - b = new RequestSendAndRecv(service_.get(), cq.get(), handler, req_id); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("not supported rpc: %s", rpc_name)); - } - - reqs[req_id] = b; - - VLOG(4) << "TryToRegisterNewOne status:" << b->Status(); -} - -void AsyncGRPCServer::HandleRequest( - ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, - std::function TryToRegisterNewOne) { - void* tag = NULL; - bool ok = false; - - while (true) { - VLOG(4) << "HandleRequest " << rpc_name << " wait next"; - if (!cq->Next(&tag, &ok)) { - VLOG(4) << "CompletionQueue " << rpc_name << " shutdown!"; - break; - } - - int req_id = static_cast(reinterpret_cast(tag)); - VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id - << " get next"; - - auto& reqs = rpc_reqs_[rpc_name]; - RequestBase* base = nullptr; - { - PADDLE_ENFORCE_EQ( - (req_id >= 0 && req_id < kRequestBufSize), true, - platform::errors::OutOfRange("request id: %s out of bounds: [0, %s)", - req_id, kRequestBufSize)); - std::unique_lock lock(cq_mutex_); - base = reqs[req_id]; - } - - VLOG(3) << base->Status2String(rpc_name); - - // reference: - // 
https://github.com/tensorflow/tensorflow/issues/5596 - // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM - // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I - if (!ok) { - VLOG(4) << "completion queue:" << rpc_name << " recv no regular event" - << " context:" << base->Status2String(rpc_name); - TryToRegisterNewOne(rpc_name, req_id); - delete base; - continue; - } - - switch (base->Status()) { - case PROCESS: { - base->Process(); - break; - } - case FINISH: { - TryToRegisterNewOne(rpc_name, req_id); - delete base; - break; - } - default: { assert(false); } - } - } -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.h b/paddle/fluid/operators/distributed/grpc/grpc_server.h deleted file mode 100644 index 3d68b7e8cebb40..00000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.h +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include // NOLINT -#include -#include - -#include "grpc++/grpc++.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_service.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/profiler.h" - -namespace grpc { -class ServerCompletionQueue; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestBase; - -class AsyncGRPCServer final : public RPCServer { - public: - explicit AsyncGRPCServer(const std::string& address, int client_num) - : RPCServer(address, client_num), ready_(0) {} - - virtual ~AsyncGRPCServer() {} - void WaitServerReady() override; - void StartServer() override; - - private: - // HandleRequest needs to be thread-safe. 
- void HandleRequest( - ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, - std::function TryToRegisterNewOne); - - void TryToRegisterNewOne(const std::string& rpc_name, int req_id); - void ShutdownQueue(); - void ShutDownImpl() override; - - private: - static const int kRequestBufSize = 100; - - std::mutex cq_mutex_; - volatile bool is_shut_down_ = false; - - std::unique_ptr service_; - std::unique_ptr<::grpc::Server> server_; - - // condition of the sub program - std::condition_variable barrier_condition_; - - std::mutex mutex_ready_; - std::condition_variable condition_ready_; - - int ready_; - - std::map> rpc_cq_; - std::map>> rpc_threads_; - std::map> rpc_reqs_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h deleted file mode 100644 index 10037c90853deb..00000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_service.h +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/platform/profiler.h" - -// NOTE: This method was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// method and did some modifications so that we can parse gRPC -// requests without too much copying of the tensor data. - -namespace grpc { -class CompletionQueue; -class Channel; -class RpcService; -class ServerCompletionQueue; -class ServerContext; - -// Support parsing/unparsing of tensorflow::VariableResponse. -// Wire-format is identical to RecvVariableResponse. 
-template <> -class SerializationTraits< - paddle::operators::distributed::GRPCVariableResponse> { - public: - static Status Serialize( - const paddle::operators::distributed::GRPCVariableResponse& msg, - grpc_byte_buffer** bp, bool* own_buffer) { - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "SerializationTraits::Serialize not implemented!")); - return Status(); - } - static Status Deserialize( - grpc_byte_buffer* buffer, - paddle::operators::distributed::GRPCVariableResponse* msg, - int max_message_size = INT_MAX) { - if (buffer == nullptr) { - return Status(StatusCode::INTERNAL, "No payload"); - } - - Status result = g_core_codegen_interface->ok(); - if (result.ok()) { - paddle::operators::distributed::GrpcByteSource source(buffer); - int ret = msg->Parse(&source); - if (ret != 0) { - result = Status(StatusCode::INTERNAL, "VariableResponse parse error"); - } - } - g_core_codegen_interface->grpc_byte_buffer_destroy(buffer); - return result; - } -}; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -enum class GrpcMethod { - kSendVariable, - kGetVariable, - kPrefetchVariable, - kCheckpointNotify, - kGetVariableNoBarrier, - kGetMonomerVariable, - kGetMonomerBarrier, - kRequestNotify, - kRequestSendAndRecv, - // when you add new handler, change kGrpcNumMethods at the same time! -}; - -static const int kGrpcNumMethods = - static_cast(GrpcMethod::kRequestSendAndRecv) + 1; - -inline const char* GrpcMethodName(GrpcMethod id) { - switch (id) { - case GrpcMethod::kSendVariable: - return "/sendrecv.SendRecvService/SendVariable"; - case GrpcMethod::kGetVariable: - return "/sendrecv.SendRecvService/GetVariable"; - case GrpcMethod::kGetVariableNoBarrier: - return "/sendrecv.SendRecvService/GetVariableNoBarrier"; - case GrpcMethod::kGetMonomerVariable: - return "/sendrecv.SendRecvService/GetMonomerVariable"; - case GrpcMethod::kGetMonomerBarrier: - return "/sendrecv.SendRecvService/GetMonomerBarrier"; - case GrpcMethod::kPrefetchVariable: - return "/sendrecv.SendRecvService/PrefetchVariable"; - case GrpcMethod::kCheckpointNotify: - return "/sendrecv.SendRecvService/CheckpointNotify"; - case GrpcMethod::kRequestNotify: - return "/sendrecv.SendRecvService/DistributeNotify"; - case GrpcMethod::kRequestSendAndRecv: - return "/sendrecv.SendRecvService/SendAndRecvVariable"; - } - - // Shouldn't be reached. - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid id: not found valid method name")); - return nullptr; -} - -class GrpcService final { - public: - class AsyncService : public ::grpc::Service { - public: - AsyncService() { - for (int i = 0; i < kGrpcNumMethods; ++i) { - AddMethod(new ::grpc::internal::RpcServiceMethod( - GrpcMethodName(static_cast(i)), - ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr)); - ::grpc::Service::MarkMethodAsync(i); - } - } - virtual ~AsyncService() {} - - // Make RequestAsyncUnary public for grpc_call.h - using ::grpc::Service::RequestAsyncUnary; - }; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc deleted file mode 100644 index f7679e9fc924df..00000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc +++ /dev/null @@ -1,344 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include - -#include "google/protobuf/io/coded_stream.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler.h" - -namespace google { -namespace protobuf { -namespace io { -class ZeroCopyInputStream; -} // namespace io -} // namespace protobuf -} // namespace google -namespace grpc { -class ByteBuffer; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -enum WireType { - WIRETYPE_VARINT = 0, - WIRETYPE_LENGTH_DELIMITED = 2, -}; - -inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; } - -inline WireType GetTagWireType(uint32_t tag) { - return static_cast(tag & 0x7); -} - -bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input, - int* result) { - uint64_t v; - if (input->ReadVarint64(&v) && v <= static_cast(INT_MAX)) { - *result = static_cast(v); - return true; - } else { - return false; - } -} - -int GRPCVariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) { - GrpcByteBufferSource source; - source.Init(byte_buffer); - GrpcByteBufferSourceWrapper r(&source); - - return Parse(&r); -} - -bool ParseLodData(::google::protobuf::io::CodedInputStream* input, - std::vector* lod) { - while (true) { - auto p = input->ReadTagWithCutoff(127); - int tag = GetTagFieldNumber(p.first); - WireType wt = GetTagWireType(p.first); - - if (!p.second) { - return (tag == 0); - } - - switch (tag) { - case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: { - uint64_t v; - if (wt == WIRETYPE_VARINT) { - if (!input->ReadVarint64(&v)) { - return false; - } - lod->push_back(v); - break; - } - - if (wt == WIRETYPE_LENGTH_DELIMITED) { - int num_bytes = 0; - if (!input->ReadVarintSizeAsInt(&num_bytes)) { - return tag; - } - int start_pos = input->CurrentPosition(); - while (input->CurrentPosition() - start_pos < num_bytes) { - uint64_t v; - if (!input->ReadVarint64(&v)) { - return tag; - } - lod->push_back(v); - } - break; - } - - return false; - } - default: { return false; } - } - } - - return true; -} - -int GRPCVariableResponse::Parse(Source* source) { - ::google::protobuf::io::ZeroCopyInputStream* input_stream = - source->contents(); - ::google::protobuf::io::CodedInputStream input(input_stream); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); - - while (true) { - auto p = input.ReadTagWithCutoff(127); - int tag = GetTagFieldNumber(p.first); - WireType wt = GetTagWireType(p.first); - if (!p.second) { - if (tag != 0) { - return -1; - } - return 0; - } - - switch (tag) { - case sendrecv::VariableMessage::kVarnameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - 
return tag; - } - - meta_.set_varname(temp); - break; - } - case sendrecv::VariableMessage::kTypeFieldNumber: { - uint32_t v; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { - return tag; - } - - meta_.set_type(static_cast<::sendrecv::VarType>(v)); - break; - } - case sendrecv::VariableMessage::kDataTypeFieldNumber: { - uint32_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { - return tag; - } - - meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v)); - break; - } - case sendrecv::VariableMessage::kDimsFieldNumber: { - // not packed - if (wt == WIRETYPE_VARINT) { - uint64_t v; - if (!input.ReadVarint64(&v)) { - return tag; - } - meta_.add_dims(v); - break; - } - - // packed - if (wt == WIRETYPE_LENGTH_DELIMITED) { - int num_bytes = 0; - if (!input.ReadVarintSizeAsInt(&num_bytes)) { - return tag; - } - int start_pos = input.CurrentPosition(); - while (input.CurrentPosition() - start_pos < num_bytes) { - uint64_t v; - if (!input.ReadVarint64(&v)) { - return tag; - } - meta_.add_dims(v); - } - break; - } - return tag; - } - case sendrecv::VariableMessage::kLodLevelFieldNumber: { - uint64_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { - return tag; - } - meta_.set_lod_level(static_cast(v)); - break; - } - case sendrecv::VariableMessage::kLodFieldNumber: { - int length = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &length)) { - return tag; - } - - std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p = - input.IncrementRecursionDepthAndPushLimit(length); - - std::vector lod_data; - if (p.second < 0 || !ParseLodData(&input, &lod_data)) { - return tag; - } - - if (!input.DecrementRecursionDepthAndPopLimit(p.first)) { - return tag; - } - - if (lod_data.size() == 0) { - break; - } - - auto lod = meta_.add_lod(); - for (uint32_t i = 0; i < lod_data.size(); i++) { - lod->add_lod_data(lod_data[i]); - } - break; - } - case sendrecv::VariableMessage::kSlrHeightFieldNumber: { - uint64_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { - return tag; - } - meta_.set_slr_height(static_cast(v)); - break; - } - case sendrecv::VariableMessage::kSerializedFieldNumber: { - int num_bytes = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &num_bytes)) { - return tag; - } - - if (!ProcSerializedField(tag, &input, num_bytes)) { - return tag; - } - - break; - } - case sendrecv::VariableMessage::kRowsFieldNumber: { - PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || - meta_.type() == sendrecv::LOD_TENSOR) && - meta_.varname() != "", - platform::errors::PreconditionNotMet( - "meta info should be got first!")); - - int num_bytes = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &num_bytes)) { - return tag; - } - - if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { - return tag; - } - break; - } - case sendrecv::VariableMessage::kOutVarnameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - return tag; - } - - meta_.set_out_varname(temp); - break; - } - case sendrecv::VariableMessage::kProfileFieldNumber: { - uint64_t profiling = 0; - if (!input.ReadVarint64(&profiling)) { - return tag; - } - meta_.set_profile(profiling); - int64_t listener_id = platform::ListenerId(); - if (listener_id <= 0) { - break; - } - if (profiling == platform::kEnableProfiler && - !platform::IsProfileEnabled()) { 
- platform::EnableProfiler(platform::ProfilerState::kCPU); - } else if (profiling == platform::kDisableProfiler && - platform::IsProfileEnabled()) { - platform::DisableProfiler( - platform::EventSortingKey::kDefault, - string::Sprintf("%s_%lld", FLAGS_rpc_server_profile_path, - listener_id)); - } - break; - } - case sendrecv::VariableMessage::kTrainerIdFieldNumber: { - uint64_t trainer_id = 0; - if (!input.ReadVarint64(&trainer_id)) { - return tag; - } - meta_.set_trainer_id(trainer_id); - break; - } - case sendrecv::VariableMessage::kTableNameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - return tag; - } - - meta_.set_table_name(temp); - break; - } - default: { - // Unknown tag, return unknown error. - return -1; - } - } - } - - return 0; -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h deleted file mode 100644 index 4d12b4a4bacd7f..00000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -namespace grpc { -class ByteBuffer; -} // namespace grpc -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class GRPCVariableResponse : public VariableResponse { - public: - GRPCVariableResponse(const framework::Scope* scope, - const platform::DeviceContext* dev_ctx, - bool create_scope = false) - : VariableResponse(scope, dev_ctx, create_scope) {} - - virtual ~GRPCVariableResponse() {} - - int Parse(Source* source) override; - - // return: - // 0:ok. - // -1: unkown error. - // other: number of error field. 
- int Parse(const ::grpc::ByteBuffer& byte_buffer); -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor.cc b/paddle/fluid/operators/distributed/heart_beat_monitor.cc deleted file mode 100644 index 9f537f53348986..00000000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor.cc +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" - -#include - -namespace paddle { -namespace operators { -namespace distributed { - -DEFINE_int32(worker_update_interval_secs, 900, - " the longest time interval between the worker update variables"); - -inline int GetCurrentUS() { - // current date/time based on current system - time_t t = std::time(0); - int now = static_cast(t); - return now; -} - -void HeartBeatMonitor::Update(const int worker_id, std::string be_monitored_var, - WorkerStatus status) { - if (status == UNINITED) { - LOG(WARNING) << "HeartBeatMonitor receive UNINITED status can not be used " - "in Update, something error"; - } - - if (!is_chief_) { - return; - } - - if ((be_monitored_var == be_monitored_var_ && status == RUNNING) || - status == COMPLETED) { - auto timestamp = GetCurrentUS(); - UnderMonitoredWorker& worker = worker_status_map_.at(worker_id); - - if (worker.status != COMPLETED) { - worker.status = status; - } - worker.timestamp = timestamp; - return; - } -} - -void HeartBeatMonitor::LostWorkerMonitor() { - VLOG(1) << "worker heartbeat monitor start at No.0 parameter server"; - while (running_) { - for (int id = 0; id < workers_; ++id) { - auto& worker = worker_status_map_.at(id); - - if (worker.status == UNINITED) { - VLOG(4) << "worker " << worker.id << " is under UNINITED"; - continue; - } - if (worker.status == COMPLETED) { - VLOG(4) << "worker " << worker.id << " is under COMPLETED"; - continue; - } - - auto timestamp = GetCurrentUS(); - - VLOG(4) << "worker " << worker.id << " status is " << worker.status - << " timestamp is " << worker.timestamp << " the interval is " - << timestamp - worker.timestamp; - - if (timestamp - worker.timestamp >= FLAGS_worker_update_interval_secs) { - PADDLE_THROW(platform::errors::ExecutionTimeout( - "the latest update of worker %d is %d secs ago, we doubt the " - "the worker is not alive and this may have a bad effect on the " - "fitting result, please check", - worker.id, FLAGS_worker_update_interval_secs)); - } - } - - std::this_thread::sleep_for(std::chrono::milliseconds(10 * 1000)); - } - VLOG(1) << "worker heartbeat monitor stopped, thread exit"; -} - -std::once_flag HeartBeatMonitor::init_flag_; -std::unique_ptr HeartBeatMonitor::monitor_(nullptr); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor.h b/paddle/fluid/operators/distributed/heart_beat_monitor.h deleted 
file mode 100644 index d96433c318b357..00000000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor.h +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include "gflags/gflags.h" - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -enum WorkerStatus { UNINITED = 0, RUNNING, COMPLETED }; - -struct UnderMonitoredWorker { - int id; - WorkerStatus status; - int timestamp; - - UnderMonitoredWorker() {} - - explicit UnderMonitoredWorker(int worker_id) { - this->id = worker_id; - this->status = UNINITED; - this->timestamp = 0; - } -}; - -class HeartBeatMonitor { - public: - explicit HeartBeatMonitor(int workers, bool is_chief, - std::string be_monitored_var) - : workers_(workers), - is_chief_(is_chief), - be_monitored_var_(be_monitored_var), - running_(true) { - PADDLE_ENFORCE_GT(workers, 0, platform::errors::InvalidArgument( - "workers must greater than 0.")); - - for (auto worker_id = 0; worker_id < workers; worker_id++) { - UnderMonitoredWorker worker(worker_id); - worker_status_map_[worker_id] = std::move(worker); - } - - // we define the No.0 pserver is the first parameter server - // only No.0 will check the heartbeat of all trainers - if (is_chief) { - monitor_thread_.reset(new std::thread( - std::bind(&HeartBeatMonitor::LostWorkerMonitor, this))); - } - } - - ~HeartBeatMonitor() { - running_ = false; - if (monitor_thread_) monitor_thread_->join(); - } - - static void Init(int workers, bool is_chief, std::string be_monitored_var) { - std::call_once(init_flag_, &HeartBeatMonitor::InitImpl, workers, is_chief, - be_monitored_var); - } - - static HeartBeatMonitor* GetInstance() { return monitor_.get(); } - - void Stop() { - running_ = false; - if (!monitor_) { - VLOG(0) << "HeartBeatMonitor is not inited, do nothing"; - } else { - if (monitor_thread_) { - monitor_thread_->join(); - monitor_thread_.reset(nullptr); - } - } - } - - void Update(const int worker_id, std::string be_monitored_var, - WorkerStatus status); - - void LostWorkerMonitor(); - - private: - // Init is called by GetInstance. 
- static void InitImpl(int workers, bool is_chief, - std::string be_monitored_var) { - if (monitor_ == nullptr) { - monitor_.reset(new HeartBeatMonitor(workers, is_chief, be_monitored_var)); - } - } - - static std::once_flag init_flag_; - static std::unique_ptr<HeartBeatMonitor> monitor_; - - int workers_; - bool is_chief_; - std::string be_monitored_var_; - std::unordered_map<int, UnderMonitoredWorker> worker_status_map_; - std::unique_ptr<std::thread> monitor_thread_{nullptr}; - std::mutex mutex_; - bool running_ = false; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc b/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc deleted file mode 100644 index 8505023f63a95d..00000000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" - -#include "gtest/gtest.h" - -namespace paddle { -namespace operators { -namespace distributed { - -void run(HeartBeatMonitor* monitor) { monitor->LostWorkerMonitor(); } - -TEST(HeartBeatMonitor, All) { - int trainers = 10; - int pserver_id = 0; - std::string var = "nce_w@GRAD.block0"; - std::string var2 = "nce_w@GRAD.block2"; - - HeartBeatMonitor::Init(trainers, pserver_id == 0, var); - - auto* monitor = HeartBeatMonitor::GetInstance(); - - std::vector<int> ids{1, 3, 5, 7}; - - for (auto& id : ids) { - monitor->Update(id, var, RUNNING); - } - - monitor->Update(9, var2, RUNNING); - monitor->Update(2, var, COMPLETED); - - std::thread t(run, monitor); - t.detach(); - - std::this_thread::sleep_for(std::chrono::milliseconds(15 * 1000)); - - monitor->Stop(); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/large_scale_kv.h b/paddle/fluid/operators/distributed/large_scale_kv.h deleted file mode 100644 index da2281231fc8a3..00000000000000 --- a/paddle/fluid/operators/distributed/large_scale_kv.h +++ /dev/null @@ -1,848 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
- -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include "gflags/gflags.h" - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/rw_lock.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/port.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/string_helper.h" - -namespace paddle { -namespace operators { -namespace distributed { - -enum Mode { training, infer }; -enum InitType { uniform_random, fill_constant, gaussian_random }; - -inline std::vector bucket(const int v_size, const int b_size) { - int remainder = v_size % b_size; - int bucket = v_size / b_size; - std::vector ret_vec(b_size, bucket); - for (int i = 0; i < remainder; ++i) { - ret_vec[i] = ret_vec[i] + 1; - } - int cur_bucket = 0; - for (int &j : ret_vec) { - int tmp = j; - j = cur_bucket; - cur_bucket += tmp; - } - ret_vec.push_back(cur_bucket); - return ret_vec; -} - -class Initializer { - public: - Initializer() {} - - explicit Initializer(const std::vector &attrs) {} - - virtual float GetValue() = 0; - - virtual ~Initializer() {} - - protected: - std::string name_; - unsigned int seed_; -}; - -class UniformInitializer : public Initializer { - public: - explicit UniformInitializer(const std::vector &attrs) { - name_ = attrs[0]; - seed_ = static_cast(std::stoi(attrs[1])); - min_ = std::stof(attrs[2]); - max_ = std::stof(attrs[3]); - - dist_ = std::uniform_real_distribution(min_, max_); - random_engine_ = framework::GetCPURandomEngine(seed_); - } - - float GetValue() override { return dist_(*random_engine_); } - - private: - float min_; - float max_; - - std::shared_ptr random_engine_; - std::uniform_real_distribution dist_; -}; - -template -inline bool entry(const int count, const T threshold); - -template <> -inline bool entry(const int count, const std::string threshold) { - return true; -} - -template <> -inline bool entry(const int count, const int threshold) { - return count >= threshold; -} - -template <> -inline bool entry(const int count, const float threshold) { - UniformInitializer uniform = UniformInitializer({"0", "0", "1"}); - return uniform.GetValue() >= threshold; -} - -class GaussianInitializer : public Initializer { - public: - explicit GaussianInitializer(const std::vector &attrs) { - name_ = attrs[0]; - seed_ = static_cast(std::stoi(attrs[1])); - mean_ = std::stof(attrs[2]); - std_ = std::stof(attrs[3]); - - random_engine_ = framework::GetCPURandomEngine(seed_); - - dist_ = std::normal_distribution(mean_, std_); - } - - float GetValue() override { return dist_(*random_engine_); } - - private: - float std_; - float mean_; - - std::shared_ptr random_engine_; - std::normal_distribution dist_; -}; - -class FillConstantInitializer : public Initializer { - public: - explicit FillConstantInitializer(const std::vector &attrs) { - name_ = attrs[0]; - value_ = std::stof(attrs[1]); - } - - float GetValue() override { return value_; } - - private: - float value_; -}; - -struct SparseMeta { - std::string name; - std::string grad_name; - std::vector value_names; - std::vector value_dims; - std::vector cached_varnames; - 
std::vector initializer_attrs; - std::string entry; - Mode mode; - - std::string ToString() { - std::stringstream ss; - ss << "name: " << name << " "; - ss << "mode: " << mode << " "; - - for (int i = 0; i < static_cast(value_names.size()); i++) { - ss << "value_name: " << value_names[i] << " dim: " << value_dims[i] - << " "; - } - - ss << " grad var: " << grad_name; - - ss << " cached varnames: "; - for (int i = 0; i < static_cast(cached_varnames.size()); i++) { - ss << cached_varnames[i] << " "; - } - - ss << " initializer attrs: "; - for (int i = 0; i < static_cast(initializer_attrs.size()); i++) { - ss << initializer_attrs[i] << " "; - } - - ss << " entry attrs: " << entry; - - return ss.str(); - } -}; - -struct VALUE { - explicit VALUE(const std::vector &names) - : names_(names), count_(0), unseen_days_(0) { - values_.resize(names.size()); - for (int i = 0; i < static_cast(names.size()); i++) { - places[names[i]] = i; - } - } - - void set(std::vector> *values) { - values_ = std::move(*values); - } - - void set(const std::vector &names, - const std::vector> &values) { - for (int i = 0; i < static_cast(names.size()); i++) { - auto idx = places[names[i]]; - auto value = values[i]; - values_[idx].assign(value.begin(), value.end()); - } - } - - std::vector *> get() { - auto pts = std::vector *>(); - pts.reserve(values_.size()); - - for (auto &value : values_) { - pts.push_back(&value); - } - return pts; - } - - int fetch_count() { return ++count_; } - void reset_unseen_days() { unseen_days_ = 0; } - - void set_entry(bool is_entry) { is_entry_ = is_entry; } - - bool get_entry() { return is_entry_; } - - std::vector *> get(const std::vector names) { - auto pts = std::vector *>(); - pts.reserve(values_.size()); - - for (int i = 0; i < static_cast(names.size()); i++) { - pts.push_back(&(values_[places[names[i]]])); - } - return pts; - } - - std::vector names_; - int count_; - bool seen_after_last_save_; - int unseen_days_; - bool is_entry_; - std::vector> values_; - std::unordered_map places; -}; - -class ValueBlock { - public: - explicit ValueBlock(const std::vector value_names, - const std::vector value_dims, const Mode &mode, - const std::vector &init_attrs, - const std::string &entry_attr) - : value_names_(value_names), value_dims_(value_dims), mode_(mode) { - // for Initializer - for (size_t i = 0; i < value_names.size(); i++) { - auto name = value_names[i]; - auto slices = string::split_string(init_attrs[i], "&"); - - if (slices[0] == "gaussian_random") { - initializers_[name] = new GaussianInitializer(slices); - } else if (slices[0] == "fill_constant") { - initializers_[name] = new FillConstantInitializer(slices); - } else if (slices[0] == "uniform_random") { - initializers_[name] = new UniformInitializer(slices); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("%s can not be supported", name)); - } - } - - // for Entry - { - if (entry_attr == "none") { - entry_func_ = - std::bind(entry, std::placeholders::_1, "none"); - } else { - auto slices = string::split_string(entry_attr, "&"); - if (slices[0] == "count_filter") { - int threshold = std::stoi(slices[1]); - entry_func_ = std::bind(entry, std::placeholders::_1, threshold); - } else if (slices[0] == "probability") { - float threshold = std::stof(slices[1]); - entry_func_ = - std::bind(entry, std::placeholders::_1, threshold); - } - } - } - - rwlock_.reset(new framework::RWLock); - } - - ~ValueBlock() { - // for (auto init : initializers_) { - // delete init.second; - // initializers_.erase(init.first); - // } - // - // 
for (auto value : values_) { - // delete value.second; - // values_.erase(value.first); - // } - } - - void Init(const int64_t &id, std::vector> *values, - int count) { - if (Has(id)) { - PADDLE_THROW(platform::errors::AlreadyExists("id already exist, error")); - } - - if (values->size() != value_names_.size()) { - PADDLE_THROW( - platform::errors::AlreadyExists("values can not match, error")); - } - - auto value = new VALUE(value_names_); - value->set(values); - value->seen_after_last_save_ = true; - value->count_ = count; - values_[id] = value; - } - - std::vector *> Get( - const int64_t &id, const std::vector &value_names) { - rwlock_->RDLock(); - auto ret_values = values_.at(id)->get(value_names); - rwlock_->UNLock(); - return ret_values; - } - - void InitFromInitializer(const int64_t &id, - const std::vector &value_names) { - rwlock_->WRLock(); - - if (Has(id)) { - Update(id); - rwlock_->UNLock(); - return; - } - - auto rets = std::vector>(); - rets.resize(value_names_.size()); - - for (int i = 0; i < static_cast(value_names_.size()); i++) { - auto name = value_names_[i]; - auto *init = initializers_.at(name); - - auto dim = value_dims_[i]; - rets[i].resize(dim); - - for (int j = 0; j < static_cast(dim); j++) { - rets[i][j] = init->GetValue(); - } - } - - Init(id, &rets, 0); - Update(id); - rwlock_->UNLock(); - } - - bool GetEntry(const int64_t &id) { - rwlock_->RDLock(); - auto value = values_.at(id); - auto entry = value->get_entry(); - rwlock_->UNLock(); - return entry; - } - - void Set(const int64_t &id, const std::vector &value_names, - const std::vector> &values) { - rwlock_->WRLock(); - auto value = values_.at(id); - value->set(value_names, values); - rwlock_->UNLock(); - } - - void Update(const int64_t id) { - auto *value = values_.at(id); - value->reset_unseen_days(); - auto count = value->fetch_count(); - - if (!value->get_entry()) { - value->set_entry(entry_func_(count)); - } - } - - private: - bool Has(const int64_t id) { - auto got = values_.find(id); - if (got == values_.end()) { - return false; - } else { - return true; - } - } - - public: - std::unordered_map values_; - - private: - std::vector value_names_; - std::vector value_dims_; - Mode mode_; - std::function entry_func_; - std::unordered_map initializers_; - std::unique_ptr rwlock_{nullptr}; -}; - -class SparseVariable { - public: - explicit SparseVariable(const SparseMeta &meta) { - meta_.name = meta.name; - meta_.mode = meta.mode; - meta_.value_names = meta.value_names; - meta_.value_dims = meta.value_dims; - meta_.grad_name = meta.grad_name; - meta_.cached_varnames = meta.cached_varnames; - meta_.initializer_attrs = meta.initializer_attrs; - meta_.entry = meta.entry; - - for (int i = 0; i < static_cast(meta_.value_names.size()); i++) { - values_dims_[meta_.value_names[i]] = meta_.value_dims[i]; - } - - for (size_t i = 0; i < shard_num_; i++) { - auto block = std::make_shared( - meta.value_names, meta.value_dims, meta.mode, meta.initializer_attrs, - meta.entry); - shard_blocks_.emplace_back(block); - } - - rwlock_.reset(new framework::RWLock); - } - - void Init(const std::vector &ids) { - rwlock_->RDLock(); - for (auto &id : ids) { - auto *block = GetShard(id); - block->InitFromInitializer(id, meta_.value_names); - } - rwlock_->UNLock(); - } - - void Get(const std::vector &ids, - const std::vector &value_names, - std::vector *>> *values) { - values->resize(ids.size()); - - auto buckets = bucket(ids.size(), 8); - std::vector> fs; - - for (int j = 0; j < 8; ++j) { - auto begin = buckets[j]; - auto end = buckets[j 
+ 1]; - - fs.push_back( - framework::Async([begin, end, &values, &ids, &value_names, this]() { - for (int x = begin; x < end; x++) { - auto id = ids[x]; - auto *block = GetShard(id); - auto id_values = block->Get(id, value_names); - (*values)[x] = id_values; - } - })); - } - - for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); - } - - void GetEntry(const std::vector &ids, std::vector *values) { - auto buckets = bucket(ids.size(), 8); - std::vector> fs; - - for (int j = 0; j < 8; ++j) { - auto begin = buckets[j]; - auto end = buckets[j + 1]; - - fs.push_back(framework::Async([begin, end, &values, &ids, this]() { - for (int x = begin; x < end; x++) { - auto id = ids[x]; - auto *block = GetShard(id); - auto is_entry = block->GetEntry(id); - - if (!is_entry) { - values->push_back(id); - } - } - })); - } - for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); - } - - void Set(const std::vector &ids, - const std::vector &value_names, - const std::vector>> &values) { - for (int i = 0; i < static_cast(ids.size()); i++) { - GetShard(ids[i])->Set(ids[i], value_names, values[i]); - } - } - - void Dims(std::vector value_names, std::vector *dims) { - for (auto &name : value_names) { - dims->push_back(values_dims_.at(name)); - } - } - - std::vector CachedVarnames() const { - return meta_.cached_varnames; - } - - void Load(const std::string &dirname) { - rwlock_->WRLock(); - VLOG(1) << "load " << meta_.name << " from dir: " << dirname << " begin"; - - std::vector filenames; - for (auto &value_name : meta_.value_names) { - auto filename = string::Sprintf("%s/%s", dirname, value_name); - filenames.push_back(filename); - } - - LoadFromSelectedRows(filenames, meta_.value_names); - VLOG(1) << "load " << meta_.name << " in dir: " << dirname << " done"; - rwlock_->UNLock(); - } - - void LoadFromSelectedRows(const std::vector &filenames, - const std::vector &valuenames) { - std::vector> variables; - auto place = platform::CPUPlace(); - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto var = std::make_shared(); - variables.push_back(var); - auto &filename = filenames[i]; - std::ifstream fin(filename, std::ios::binary); - auto *selectedRows = var->GetMutable(); - - platform::DeviceContextPool &pool = - platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - framework::DeserializeFromStream(fin, selectedRows, dev_ctx); - selectedRows->SyncIndex(); - } - - std::vector tensors; - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto &slr = variables[i]->Get(); - auto src_t = slr.value(); - const auto *value = src_t.data(); - tensors.push_back(value); - } - - for (int i = 1; i < static_cast(filenames.size()); i++) { - auto rows_0 = variables[0]->Get().rows(); - auto rows_i = variables[i]->Get().rows(); - - bool is_equal = std::equal(rows_0.begin(), rows_0.end(), rows_i.begin()); - - if (!is_equal) { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s and %s are not equal, can not be load rightly", filenames[0], - filenames[i])); - } - } - - auto rows = variables[0]->Get().rows(); - - for (auto i = 0; i < static_cast(rows.size()); i++) { - auto id = rows[i]; - std::vector> values; - values.resize(filenames.size()); - - for (int j = 0; j < static_cast(filenames.size()); ++j) { - values[j].resize(meta_.value_dims[j]); - std::memcpy(values[j].data(), tensors[j] + i * meta_.value_dims[j], - sizeof(float) * meta_.value_dims[j]); - } - - auto *block = GetShard(id); - block->Init(id, &values, 0); - block->Update(id); - } - } - - void Save(const std::string &dirname, 
const int mode = 0) { - rwlock_->WRLock(); - VLOG(3) << "save " << meta_.name << " in dir: " << dirname << " begin"; - - MkDirRecursively(dirname.c_str()); - - std::vector filenames; - for (auto &value_name : meta_.value_names) { - auto filename = string::Sprintf("%s/%s", dirname, value_name); - filenames.push_back(filename); - } - - SaveToSelectedRows(filenames, meta_.value_names, mode); - VLOG(3) << "save " << meta_.name << " in dir: " << dirname << " done"; - rwlock_->UNLock(); - } - - void SaveToSelectedRows(const std::vector &filenames, - const std::vector &valuenames, - const int mode) { - for (auto &value_name : valuenames) { - auto it = std::find(meta_.value_names.begin(), meta_.value_names.end(), - value_name); - if (it == meta_.value_names.end()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "[%s] is invalid param for [%s]", value_name, meta_.name)); - } - } - - auto place = platform::CPUPlace(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - std::vector ids; - - for (auto &block : shard_blocks_) { - for (auto value : block->values_) { - if (mode == 0) { - ids.push_back(value.first); - } else { - bool id_need_save = false; - // save all params - if (mode == 1) { - id_need_save = true; - } else { - id_need_save = value.second->seen_after_last_save_; - } - - if (id_need_save) { - ids.push_back(value.first); - } - value.second->seen_after_last_save_ = false; - } - } - } - - VLOG(3) << "save " << ids.size() << " feasigns for " << meta_.name - << " with mode: " << mode; - - std::vector> variables; - std::vector tensors; - std::vector dims; - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto dim = values_dims_.at(valuenames[i]); - auto var = std::make_shared(); - auto *slr = var->GetMutable(); - auto *src_t = slr->mutable_value(); - - src_t->Resize({static_cast(ids.size()), dim}); - auto *value = src_t->mutable_data(place); - - dims.push_back(dim); - variables.push_back(var); - tensors.push_back(value); - } - - std::vector *>> values; - Get(ids, valuenames, &values); - - int64_t offset = 0; - for (auto &vss : values) { - for (int i = 0; i < static_cast(vss.size()); i++) { - auto &vs = vss[i]; - std::memcpy(tensors[i] + offset * dims[i], vs->data(), - sizeof(float) * dims[i]); - } - offset += 1; - } - - for (auto &var : variables) { - auto *slr = var->GetMutable(); - slr->set_rows(ids); - slr->set_height(ids.size()); - } - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto &filename = filenames[i]; - auto &selectedRows = variables[i]->Get(); - - std::ofstream fout(filename, std::ios::binary); - PADDLE_ENFORCE_EQ(static_cast(fout), true, - platform::errors::Unavailable( - "Cannot open %s to save variables.", filename)); - - framework::SerializeToStream(fout, selectedRows, dev_ctx); - fout.close(); - } - } - - void SaveToText(const std::vector &filenames, - const std::vector &valuenames) { - for (auto &value_name : valuenames) { - auto it = std::find(meta_.value_names.begin(), meta_.value_names.end(), - value_name); - if (it == meta_.value_names.end()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "[%s] is invalid param for [%s]", value_name, meta_.name)); - } - } - - std::vector> fouts; - - for (auto filename : filenames) { - std::unique_ptr fout(new std::ofstream(filename)); - fouts.push_back(std::move(fout)); - } - - for (auto &block : shard_blocks_) { - for (auto value : block->values_) { - std::vector *> vss = value.second->get(valuenames); - - auto id = 
value.first; - - for (int i = 0; i < static_cast(vss.size()); i++) { - auto &vs = vss[i]; - std::stringstream ss; - ss << id << "\t"; - ss << vs->size() << "\t"; - for (auto v : (*vs)) { - ss << v << " "; - } - ss << "\n"; - - fouts[i]->write(ss.str().c_str(), sizeof(char) * ss.str().size()); - } - } - } - - for (int i = 0; i < static_cast(fouts.size()); i++) { - fouts[i]->close(); - } - } - - int64_t Size() { - int64_t cnt = 0; - - for (auto &block : shard_blocks_) { - cnt += block->values_.size(); - } - return cnt; - } - - ValueBlock *GetShard(const int64_t id) { - return shard_blocks_[id & shard_mask_].get(); - } - - SparseMeta *GetMeta() { return &meta_; } - - private: - std::unique_ptr rwlock_{nullptr}; - - SparseMeta meta_; - std::unordered_map values_dims_; - const size_t shard_mask_ = 127; - const size_t shard_num_ = 128; - std::vector> shard_blocks_; -}; - -class LargeScaleKV { - public: - LargeScaleKV() {} - - explicit LargeScaleKV(const std::vector &table_metas) { - for (auto &sparse_meta : table_metas) { - auto table_name = sparse_meta.name; - auto meta = std::shared_ptr( - new SparseVariable(std::move(sparse_meta))); - sparse_variables[table_name] = meta; - grad_to_variables[sparse_meta.grad_name] = table_name; - grad_names_.push_back(sparse_meta.grad_name); - } - } - - ~LargeScaleKV() {} - - static std::shared_ptr GetInstantcePtr() { return scale_kv_; } - - static LargeScaleKV *GetInstance() { return scale_kv_.get(); } - - static LargeScaleKV *InitInstance( - const std::vector &table_metas) { - std::call_once(init_flag_, &LargeScaleKV::Init, table_metas); - return scale_kv_.get(); - } - - static void Init(const std::vector &table_metas) { - if (scale_kv_.get() == nullptr) { - scale_kv_.reset(new LargeScaleKV(table_metas)); - } - } - - SparseVariable *Get(const std::string &name) { - auto variable = sparse_variables.at(name); - return variable.get(); - } - - bool ParamInLargeScale(const std::string &name) { - auto got = sparse_variables.find(name); - - if (got == sparse_variables.end()) { - return false; - } - - return true; - } - - bool GradInLargeScale(const std::string &name) { - auto got = grad_to_variables.find(name); - - if (got == grad_to_variables.end()) { - return false; - } - - return true; - } - - SparseVariable *GetByGrad(const std::string &name) { - return Get(grad_to_variables[name]); - } - - const std::vector &GetAllGrads() { return grad_names_; } - - private: - std::unordered_map> - sparse_variables; - std::unordered_map grad_to_variables; - std::vector grad_names_; - static std::shared_ptr scale_kv_; - static std::once_flag init_flag_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc deleted file mode 100644 index 558d70e5c3353f..00000000000000 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ /dev/null @@ -1,311 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/parameter_prefetch.h" -#include -#include -#include -#include -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/distributed.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -static void SplitIdsIntoMultipleVarsBySection( - const std::vector &in_ids, - const std::vector &in_varnames, const int tables, - const int pservers, const bool is_distibuted, framework::Scope *scope, - std::vector> *splited_ids, - std::vector> *origin_ids) { - PADDLE_ENFORCE_EQ( - in_varnames.size(), tables, - platform::errors::OutOfRange( - "send varnames size: %d not equal table number: %d, internal error", - in_varnames.size(), tables)); - - PADDLE_ENFORCE_LE( - tables, pservers, - platform::errors::OutOfRange("table number %d not equal or less than " - "pserver number: %d, internal error", - tables, pservers)); - - auto place = platform::CPUPlace(); - - std::set st(in_ids.begin(), in_ids.end()); - std::vector all_ids; - all_ids.assign(st.begin(), st.end()); - - splited_ids->resize(tables); - origin_ids->resize(tables); - - if (is_distibuted) { - for (auto &id : all_ids) { - auto pserver_id = id % pservers; - (*splited_ids)[pserver_id].push_back(id); - (*origin_ids)[pserver_id].push_back(id); - } - } else { - for (auto &id : all_ids) { - auto pserver_id = id % pservers; - (*origin_ids)[pserver_id].push_back(id); - id = id / pservers; - (*splited_ids)[pserver_id].push_back(id); - } - } - - for (size_t i = 0; i < in_varnames.size(); ++i) { - auto *id_tensor = - scope->Var(in_varnames[i])->GetMutable(); - - auto &ids = (*splited_ids)[i]; - if (!ids.empty()) { - auto *id_tensor_data = id_tensor->mutable_data( - framework::make_ddim({static_cast(ids.size()), 1}), place); - memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size()); - } - } -} - -typedef std::vector> TableAndEndpoints; - -void prefetch_core( - const std::vector &ids, const TableAndEndpoints &tables, - const framework::ExecutionContext &context, const framework::Scope &scope, - const bool is_distributed, - std::unordered_map> *recved_vec_map) { - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance( - context.Attr("trainer_id")); - - int pservers = context.Attr("pserver_num"); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &actual_ctx = *pool.Get(platform::CPUPlace()); - - std::unique_ptr local_scope = scope.NewTmpScope(); - - std::vector in_var_names; - std::vector out_var_names; - for (size_t i = 0; i < tables.size(); ++i) { - in_var_names.push_back("prefetch_send@" + tables[i].second); - out_var_names.push_back("prefetch_recv@" + tables[i].second); - } - - std::vector> split_ids; - std::vector> origin_ids; - SplitIdsIntoMultipleVarsBySection(ids, in_var_names, tables.size(), pservers, - is_distributed, local_scope.get(), - &split_ids, &origin_ids); - - // create output var in local scope - for (auto &name : out_var_names) { - local_scope->Var(name)->GetMutable(); - } - - std::vector rets; - for (size_t i 
= 0; i < in_var_names.size(); i++) { - if (NeedSend(*local_scope.get(), in_var_names[i])) { - VLOG(3) << "sending " << in_var_names[i] << " to " << tables[i].second - << " to get " << out_var_names[i] << " back"; - rets.push_back(rpc_client->AsyncPrefetchVar( - tables[i].second, actual_ctx, *local_scope.get(), in_var_names[i], - out_var_names[i], tables[i].first)); - } else { - VLOG(3) << "don't send no-initialied variable: " << out_var_names[i]; - } - } - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - - for (size_t o_idx = 0; o_idx < out_var_names.size(); ++o_idx) { - auto &ids_in_this_section = origin_ids[o_idx]; - - if (!ids_in_this_section.empty()) { - auto &prefetch_out_var = - local_scope->Var(out_var_names[o_idx])->Get(); - const auto *out_var_data = prefetch_out_var.data(); - auto &dims = prefetch_out_var.dims(); - - PADDLE_ENFORCE_EQ(dims.size(), 2, - platform::errors::InvalidArgument( - "The size of Tensor dims must be 2.")); - PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0], - platform::errors::InvalidArgument( - "The size of ids in this section must equal to " - "dims[0]: %s, but got %s", - dims[0], ids_in_this_section.size())); - - auto row_numel = dims[1]; - - for (int64_t i = 0; i < dims[0]; ++i) { - auto origin_id = ids_in_this_section[i]; - std::vector vecs(row_numel); - - std::copy_n(out_var_data + i * row_numel, row_numel, vecs.begin()); - (*recved_vec_map)[origin_id] = vecs; - } - } else { - VLOG(3) << "ids in this section is empty"; - } - } -} - -void prefetch(const std::string &id_name, const std::string &out_name, - const std::string &persistable_var_name, - const bool is_distributed, - const std::vector &table_names, - const std::vector &endpoints, - const framework::ExecutionContext &context, - const framework::Scope &scope) { - prefetchs({id_name}, {out_name}, persistable_var_name, is_distributed, - table_names, endpoints, context, scope); -} - -void prefetchs(const std::vector &id_var_names, - const std::vector &out_var_names, - const std::string &persistable_var_name, - const bool is_distributed, - const std::vector &table_names, - const std::vector &endpoints, - const framework::ExecutionContext &context, - const framework::Scope &scope) { - auto vec_dim_1 = 0; - auto vec_dim_0 = 0; - framework::Variable *var = scope.FindVar(persistable_var_name); - - if (var->IsType()) { - vec_dim_1 = var->Get().value().dims()[1]; - } else { - vec_dim_0 = var->Get().dims()[0]; - vec_dim_1 = var->Get().dims()[1]; - } - - PADDLE_ENFORCE_GT(vec_dim_1, 0, - platform::errors::InvalidArgument( - "lookup table var's dim must gather than 0")); - - const auto place = - scope.FindVar(id_var_names[0])->Get().place(); - - std::vector> ids_group; - std::vector ids_union; - std::vector ids_lods; - TableAndEndpoints tables; - - for (auto &id_name : id_var_names) { - auto &id_tensor = scope.FindVar(id_name)->Get(); - std::vector ids; - TensorToVector(id_tensor, context.device_context(), &ids); - ids_union.insert(ids_union.end(), ids.begin(), ids.end()); - ids_group.push_back(ids); - ids_lods.push_back(id_tensor.lod()); - } - - std::unordered_set s(ids_union.begin(), ids_union.end()); - ids_union.assign(s.begin(), s.end()); - - for (auto &i : ids_union) { - PADDLE_ENFORCE_GE( - i, 0, platform::errors::OutOfRange( - "each element in embedding should be larger or equal 0")); - if (!is_distributed) { - PADDLE_ENFORCE_LT( - i, vec_dim_0, - platform::errors::OutOfRange( - 
"embedding id must in [0, %d) when is_distributed False", - vec_dim_0)); - } - } - - for (size_t i = 0; i < table_names.size(); i++) { - tables.push_back(std::make_pair(table_names[i], endpoints[i])); - } - std::unordered_map> recved_vec_map; - prefetch_core(ids_union, tables, context, scope, is_distributed, - &recved_vec_map); - - auto padding_idx = distributed::kNoPadding; - - if (context.HasAttr("padding_idx")) { - padding_idx = context.Attr("padding_idx"); - } - - for (size_t i = 0; i < out_var_names.size(); i++) { - std::vector ids = ids_group[i]; - auto ids_size = ids.size(); - auto *out_t = - scope.FindVar(out_var_names[i])->GetMutable(); - out_t->set_lod(ids_lods[i]); - out_t->Resize( - framework::make_ddim({static_cast(ids_size), vec_dim_1})); - auto *out_d = out_t->mutable_data(place); - - if (platform::is_cpu_place(out_t->place())) { - for (auto idx = 0; idx < static_cast(ids_size); idx++) { - const auto &id = ids[idx]; - if (padding_idx != distributed::kNoPadding && id == padding_idx) { - memset(out_d + idx * vec_dim_1, 0, sizeof(float) * vec_dim_1); - } else { - std::copy_n(recved_vec_map[id].begin(), vec_dim_1, - out_d + idx * vec_dim_1); - } - } - } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - std::vector ids_value_vec(ids_size * vec_dim_1); - for (auto idx = 0; idx < static_cast(ids_size); idx++) { - const auto &id = ids[idx]; - if (padding_idx != distributed::kNoPadding && id == padding_idx) { - memset(&ids_value_vec[idx * vec_dim_1], 0, sizeof(float) * vec_dim_1); - } else { - memcpy(&ids_value_vec[idx * vec_dim_1], &recved_vec_map[id][0], - sizeof(float) * vec_dim_1); - } - } - auto &gpu_place = BOOST_GET_CONST(platform::CUDAPlace, out_t->place()); - auto &cpu_place = BOOST_GET_CONST( - platform::CPUPlace, paddle::platform::CPUDeviceContext().GetPlace()); - auto stream = context.cuda_device_context().stream(); - memory::Copy(gpu_place, out_d, cpu_place, &ids_value_vec[0], - sizeof(float) * ids_size * vec_dim_1, stream); -#else - PADDLE_ENFORCE(true, platform::errors::PermissionDenied( - "Paddle is not compiled with GPU!")); -#endif - } - } -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h deleted file mode 100644 index 6fd3a998813c0b..00000000000000 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -constexpr int64_t kNoPadding = -1; - -void prefetchs(const std::vector& id_var_names, - const std::vector& out_var_names, - const std::string& persistable_var_name, const bool backfill, - const std::vector& table_names, - const std::vector& endpoints, - const framework::ExecutionContext& context, - const framework::Scope& scope); - -void prefetch(const std::string& id_name, const std::string& out_name, - const std::string& persistable_var_name, const bool backfill, - const std::vector& table_names, - const std::vector& endpoints, - const framework::ExecutionContext& context, - const framework::Scope& scope); - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc deleted file mode 100644 index d5d3c9c3c7c48f..00000000000000 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ /dev/null @@ -1,248 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include - -#include "glog/logging.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/parameter_recv.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -template -void RecvSparseLodTensor(const CommContext &rpc_ctx, - const framework::Scope &scope) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto cpu_place = platform::CPUPlace(); - auto &cpu_ctx = *pool.Get(cpu_place); - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::unique_ptr local_scope = scope.NewTmpScope(); - std::vector tensors; - std::vector rets; - std::vector recv_varnames; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - VLOG(4) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; - local_scope->Var(recv_var_name); - // sparse param in recv_scope is LoDTensor - rets.push_back(rpc_client->AsyncGetVarNoBarrier( - rpc_ctx.epmap[i], cpu_ctx, *local_scope.get(), recv_var_name, - recv_var_name)); - recv_varnames.push_back(recv_var_name); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - auto &recv_var_name = recv_varnames[i]; - auto *local_var = local_scope->FindVar(recv_var_name); - const auto *value = local_var->Get().data(); - tensors.push_back(value); - } - - auto *merged_var = scope.FindVar(rpc_ctx.var_name); - - if (merged_var == nullptr || !merged_var->IsInitialized()) { - PADDLE_THROW( - platform::errors::InvalidArgument("%s must initialized at first.")); - } - auto dims1 = merged_var->Get().dims()[1]; - int64_t height = 0; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto *splited_var = local_scope->FindVar(rpc_ctx.splited_varnames[i]); - height += splited_var->Get().dims()[0]; - } - - PADDLE_ENFORCE_EQ( - merged_var->Get().dims()[0], height, - platform::errors::InvalidArgument( - "Received variable must has same dimension with local variable.")); - - auto *merged_t = merged_var->GetMutable(); - auto *merged_d = merged_t->mutable_data(cpu_place); - - auto pserver_num = rpc_ctx.splited_varnames.size(); - for (int x = 0; x < height; ++x) { - auto id = x % pserver_num; - auto idx = x / pserver_num; - std::memcpy(merged_d + x * dims1, tensors[id] + idx * dims1, - sizeof(float) * dims1); - } -} - -template -void RecvGeoSparseRecords(const CommContext &rpc_ctx, - const framework::Scope &scope) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto cpu_place = platform::CPUPlace(); - auto &cpu_ctx = *pool.Get(cpu_place); - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::unique_ptr local_scope = scope.NewTmpScope(); - - std::vector rets; - for (size_t i = 0; i < 
rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - local_scope->Var(recv_var_name); - VLOG(4) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; - // sparse param in recv_scope is LoDTensor - rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx, - *local_scope.get(), recv_var_name, - recv_var_name, recv_var_name)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - - int64_t height = 0; - int64_t ids_num = 0; - int64_t width = 0; - - std::vector all_ids; - auto pserver_num = rpc_ctx.splited_varnames.size(); - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - auto *recv_var = local_scope->FindVar(recv_var_name); - auto &recv_t = recv_var->Get(); - - height += recv_t.height(); - ids_num += recv_t.rows().size(); - width = recv_t.value().dims()[1]; - - if (rpc_ctx.is_distributed) { - std::copy(recv_t.rows().begin(), recv_t.rows().end(), - std::back_inserter(all_ids)); - } else { - std::transform(recv_t.rows().begin(), recv_t.rows().end(), - std::back_inserter(all_ids), - [&](int64_t id) { return id * pserver_num + i; }); - } - } - - auto *var = scope.FindVar(rpc_ctx.var_name); - auto *t_ = var->GetMutable(); - T *out_data = - t_->mutable_value()->mutable_data({ids_num, width}, cpu_place); - t_->set_height(height); - t_->set_rows(all_ids); - - int64_t cnt = 0; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - auto *recv_var = local_scope->FindVar(recv_var_name); - auto &recv_t = recv_var->Get(); - - auto rows = recv_t.rows().size(); - const T *in_data = recv_t.value().data(); - std::copy_n(in_data, rows * width, out_data + cnt); - cnt += rows * width; - } - t_->SyncIndex(); -} - -template -void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) { - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::vector rets; - - // variable do not spilt - if (rpc_ctx.origin_varnames.size() == 1 && - rpc_ctx.splited_varnames.size() == 1) { - auto varname = rpc_ctx.origin_varnames[0]; - const auto place = - scope.FindVar(varname)->Get().place(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &ctx = *pool.Get(place); - VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0] << " in gpu? 
" - << platform::is_gpu_place(place); - rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], ctx, - scope, varname, varname)); - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE( - rets[i]->Wait(), 0U, - platform::errors::ExecutionTimeout("internal error in RPCClient")); - } - - VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name; - return; - } else { - PADDLE_ENFORCE(false, platform::errors::Unimplemented( - "ParameterRecv can not recv dense with multi " - "parts now, add it soon.")); - } -} - -template -void ParameterRecv::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope, - bool geo_records) { - VLOG(3) << "ParameterRecv in " << rpc_ctx.var_name; - - PADDLE_ENFORCE_GE(rpc_ctx.origin_varnames.size(), 1, - platform::errors::InvalidArgument( - "origin_varnames.size() >= 1 is permitted")); - - if (rpc_ctx.is_sparse) { - if (geo_records) { - RecvGeoSparseRecords(rpc_ctx, scope); - } else { - RecvSparseLodTensor(rpc_ctx, scope); - } - } else { - RecvLodTensor(rpc_ctx, scope); - } - - VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name; -} -template -void ParameterRecv::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope) { - this->operator()(rpc_ctx, scope, false); -} - -template struct ParameterRecv; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_recv.h b/paddle/fluid/operators/distributed/parameter_recv.h deleted file mode 100644 index c30d21aa791e23..00000000000000 --- a/paddle/fluid/operators/distributed/parameter_recv.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" - -namespace paddle { -namespace operators { -namespace distributed { - -template -struct ParameterRecv { - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope, - bool barrier); - - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope); -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc deleted file mode 100644 index 109514ca2541c3..00000000000000 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ /dev/null @@ -1,331 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/parameter_send.h" -#include -#include -#include "glog/logging.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace framework { -class Scope; -class Tensor; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -typedef std::vector> EP_SPLIT_TABLE_PAIRS; - -inline EP_SPLIT_TABLE_PAIRS GetMultiFieldCommContext( - const CommContext &rpc_ctx, const framework::Scope &scope, - int multi_parts) { - EP_SPLIT_TABLE_PAIRS table_pairs; - - auto *send_var = scope.FindVar(rpc_ctx.var_name); - if (send_var->IsType()) { - PADDLE_ENFORCE_GE(multi_parts, 1, - platform::errors::InvalidArgument( - "multi_parts must == 1 in parameter send, now is: %d", - multi_parts)); - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - table_pairs.push_back( - std::make_pair(rpc_ctx.epmap[i], rpc_ctx.splited_varnames[i])); - } - - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "GetMultiFieldCommContext unsupported LoDTensor current!")); - } - - return table_pairs; -} // namespace distributed - -void SendByNotifyRPC(const CommContext &rpc_ctx, - const framework::Scope &scope) { - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto &send_var_name = rpc_ctx.var_name; - std::vector rets; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - if (NeedSend(scope, send_var_name)) { - for (size_t j = 0; j < rpc_ctx.epmap.size(); j++) { - auto &endpoint = rpc_ctx.epmap[j]; - VLOG(4) << "sending " << send_var_name << " to " << endpoint; - rets.push_back(rpc_client->AsyncDistributeNotify(endpoint, cpu_ctx, scope, - send_var_name)); - VLOG(4) << "send var " << send_var_name << " by notify RPC done"; - } - } else { - VLOG(3) << "don't send non-initialized variable: " << rpc_ctx.var_name; - } - - for (auto &handle : rets) { - PADDLE_ENFORCE_NE(handle->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } -} - -template -void ParameterSend::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope, bool sync, - int multi_parts) { - if (rpc_ctx.var_name == STEP_COUNTER) { - SendByNotifyRPC(rpc_ctx, scope); - return; - } - - std::unique_ptr local_scope = scope.NewTmpScope(); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx = *pool.Get(platform::CPUPlace()); - - distributed::RPCClient *rpc_client = - 
distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::vector rets; - auto *send_var = scope.FindVar(rpc_ctx.var_name); - - if (send_var->IsType()) { - size_t out_num = rpc_ctx.splited_varnames.size(); - if (out_num > 1) { - auto &send_tensor = send_var->Get(); - auto &send_tensor_dims = send_tensor.dims(); - std::vector outs_dims; - outs_dims.reserve(out_num); - - // infer output shape - PADDLE_ENFORCE_EQ( - rpc_ctx.height_sections.size(), out_num, - platform::errors::InvalidArgument("tensor split sections size" - "should be equal to output size.")); - for (size_t i = 0; i < out_num; ++i) { - auto dim = send_tensor_dims; - dim[0] = rpc_ctx.height_sections[i]; - outs_dims.push_back(dim); - } - - // create output var in local scope - size_t row_offset = 0; - for (size_t i = 0; i < out_num; ++i) { - framework::Tensor *out = local_scope->Var(rpc_ctx.splited_varnames[i]) - ->GetMutable(); - *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); - row_offset += outs_dims[i][0]; - } - } else { - auto &send_tensor = send_var->Get(); - framework::Tensor *out = local_scope->Var(rpc_ctx.splited_varnames[0]) - ->GetMutable(); - out->ShareDataWith(send_tensor); - } - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &send_var_name = rpc_ctx.splited_varnames[i]; - auto &endpoint = rpc_ctx.epmap[i]; - VLOG(4) << " send var name: " << send_var_name - << "endpoint: " << endpoint; - if (NeedSend(*local_scope.get(), send_var_name)) { - VLOG(3) << "sending " << send_var_name << " to " << endpoint; - rets.push_back(rpc_client->AsyncSendVar( - endpoint, cpu_ctx, *local_scope.get(), send_var_name)); - VLOG(4) << "send var " << send_var_name << " async handle done"; - } else { - VLOG(3) << "don't send non-initialized variable: " - << rpc_ctx.splited_varnames[i]; - } - } - } else if (send_var->IsType()) { - auto &send_slr = send_var->Get(); - - auto &send_rows = send_slr.rows(); - if (send_rows.size() == 0) { - LOG(WARNING) - << "WARNING: The variable sent to pserver is empty, which " - "may cause an unknown error. 
Please check the state of " - "use_double_buffer in pyreader/dataloader async mode, you need to " - "turn it false."; - } - - std::vector> outs_rows_idx; - std::vector> outs_dense_idx; - - auto table_pairs = GetMultiFieldCommContext(rpc_ctx, scope, 1); - outs_rows_idx.resize(table_pairs.size()); - outs_dense_idx.resize(table_pairs.size()); - - auto row_numel = send_slr.value().numel() / send_slr.value().dims()[0]; - auto *src = send_slr.value().data(); - - // create output var in local scope - std::vector outs; - for (auto &table : table_pairs) { - auto *out = - local_scope->Var(table.second)->GetMutable(); - outs.push_back(out); - } - - if (!rpc_ctx.is_distributed) { - auto pserver_num = rpc_ctx.epmap.size(); - - // split rows index into output sparse vars - for (size_t i = 0; i < send_rows.size(); ++i) { - auto ep_idx = send_rows[i] % pserver_num; - auto id = send_rows[i] / pserver_num; - outs_rows_idx[ep_idx].push_back(id); - outs_dense_idx[ep_idx].push_back(i); - } - - auto place = platform::CPUPlace(); - - for (size_t out_idx = 0; out_idx < rpc_ctx.splited_varnames.size(); - out_idx++) { - auto rows_idx = outs_rows_idx[out_idx]; - - auto dims = send_slr.GetCompleteDims(); - dims[0] = rows_idx.size(); - outs[out_idx]->set_height(rpc_ctx.height_sections[out_idx]); - outs[out_idx]->mutable_rows()->clear(); - outs[out_idx]->mutable_value()->mutable_data(dims, send_slr.place()); - - if (rows_idx.size() > 0) { - for (auto idx : rows_idx) { - outs[out_idx]->mutable_rows()->push_back(idx); - } - auto dst = outs[out_idx]->mutable_value()->mutable_data(place); - for (size_t j = 0; j < rows_idx.size(); j++) { - if (platform::is_cpu_place(place)) { - memory::Copy(platform::CPUPlace(), dst + j * row_numel, - platform::CPUPlace(), - src + outs_dense_idx[out_idx][j] * row_numel, - sizeof(T) * row_numel); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("do not support GPU now")); - } - } - } - PADDLE_ENFORCE_EQ( - rows_idx.size(), outs[out_idx]->rows().size(), - platform::errors::InvalidArgument( - "rows should has the same size with tensor dim 0")); - } - } else { - auto pserver_num = rpc_ctx.epmap.size(); - - // split rows index into output sparse vars - for (size_t i = 0; i < send_rows.size(); ++i) { - auto out_idx = send_rows[i] % pserver_num; - outs_rows_idx[out_idx].push_back(send_rows[i]); - outs_dense_idx[out_idx].push_back(i); - } - - auto place = platform::CPUPlace(); - - for (size_t out_idx = 0; out_idx < rpc_ctx.splited_varnames.size(); - out_idx++) { - auto rows_idx = outs_rows_idx[out_idx]; - - auto dims = send_slr.GetCompleteDims(); - dims[0] = rows_idx.size(); - - outs[out_idx]->set_height(rpc_ctx.height_sections[out_idx]); - outs[out_idx]->mutable_rows()->clear(); - outs[out_idx]->mutable_value()->mutable_data(dims, send_slr.place()); - - if (rows_idx.size() > 0) { - for (auto idx : rows_idx) { - outs[out_idx]->mutable_rows()->push_back(idx); - } - auto dst = outs[out_idx]->mutable_value()->mutable_data(place); - for (size_t j = 0; j < rows_idx.size(); j++) { - if (platform::is_cpu_place(place)) { - memory::Copy(platform::CPUPlace(), dst + j * row_numel, - platform::CPUPlace(), - src + outs_dense_idx[out_idx][j] * row_numel, - sizeof(T) * row_numel); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("do not support GPU now")); - } - } - } - PADDLE_ENFORCE_EQ( - rows_idx.size(), outs[out_idx]->rows().size(), - platform::errors::InvalidArgument( - "rows should has the same size with tensor dim 0")); - } - } - - for (size_t i = 0; i < table_pairs.size(); i++) { - 
auto &send_var_name = table_pairs[i].second; - auto &endpoint = table_pairs[i].first; - auto need_send = NeedSend(*local_scope.get(), send_var_name); - - VLOG(4) << "send var name: " << send_var_name - << " send var endpoint: " << endpoint - << " need send: " << need_send; - - if (need_send) { - VLOG(4) << "sending " << send_var_name << " to " << endpoint; - - rets.push_back(rpc_client->AsyncSendVar( - endpoint, cpu_ctx, *local_scope.get(), send_var_name)); - VLOG(4) << "send var " << send_var_name << " async handle done"; - } else { - VLOG(4) << "don't send non-initialized variable: " - << rpc_ctx.splited_varnames[i]; - } - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "unsupported var type: %s to send!", send_var->Type())); - } - - VLOG(4) << "Prepare to send var " << rpc_ctx.var_name; - if (sync) { - for (auto &handle : rets) { - VLOG(4) << "Wait send var to pserver handle: " << handle; - PADDLE_ENFORCE_NE(handle->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - } -} - -template struct ParameterSend; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h deleted file mode 100644 index cedc98b1fcadd4..00000000000000 --- a/paddle/fluid/operators/distributed/proto_encoder_helper.h +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// NOTE: This file was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// file and did some modifications so that we can send gRPC -// requests without too much copying of the tensor data. 
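The borrowed encoder below leans on protobuf's base-128 varint wire format: seven payload bits per byte, with the high bit marking that more bytes follow. A minimal, self-contained sketch of that encoding, assuming only the C++ standard library (the vector-returning signature is this sketch's own convenience, not the helper's interface):

#include <cstdint>
#include <cstdio>
#include <vector>

// Encode a 32-bit unsigned value as a protobuf-style base-128 varint.
std::vector<unsigned char> ToVarint32(uint32_t v) {
  std::vector<unsigned char> out;
  while (v >= 0x80) {
    out.push_back(static_cast<unsigned char>(v | 0x80));  // set continuation bit
    v >>= 7;                                               // move on to the next 7 bits
  }
  out.push_back(static_cast<unsigned char>(v));            // last byte: high bit clear
  return out;
}

int main() {
  // 300 encodes to AC 02, the canonical protobuf documentation example.
  for (unsigned char b : ToVarint32(300)) std::printf("%02X ", b);
  std::printf("\n");
  return 0;
}

The deleted header implements the same idea in place (EncodeVarint32/EncodeVarint64 writing into a caller-provided buffer), which is what lets tensor payloads be framed for gRPC without an extra copy.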
- -#pragma once - -#include - -#include "grpc++/grpc++.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -char* EncodeVarint32(char* dst, uint32_t v) { - // Operate on characters as unsigneds - unsigned char* ptr = reinterpret_cast(dst); - static const int B = 128; - if (v < (1 << 7)) { - *(ptr++) = v; - } else if (v < (1 << 14)) { - *(ptr++) = v | B; - *(ptr++) = v >> 7; - } else if (v < (1 << 21)) { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = v >> 14; - } else if (v < (1 << 28)) { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = (v >> 14) | B; - *(ptr++) = v >> 21; - } else { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = (v >> 14) | B; - *(ptr++) = (v >> 21) | B; - *(ptr++) = v >> 28; - } - return reinterpret_cast(ptr); -} - -char* EncodeVarint64(char* dst, uint64_t v) { - static const int B = 128; - unsigned char* ptr = reinterpret_cast(dst); - while (v >= B) { - *(ptr++) = (v & (B - 1)) | B; - v >>= 7; - } - *(ptr++) = static_cast(v); - return reinterpret_cast(ptr); -} - -int VarintLength(uint64_t v) { - int len = 1; - while (v >= 128) { - v >>= 7; - len++; - } - return len; -} - -class ProtoEncodeHelper { - public: - ProtoEncodeHelper(char* buf, int max_size) - : base_(buf), p_(buf), limit_(base_ + max_size) {} - - ~ProtoEncodeHelper() {} - - const char* data() const { return base_; } - size_t size() const { return p_ - base_; } - - void WriteUint64(int tag, uint64_t v) { - Encode32(combine(tag, WIRETYPE_VARINT)); - Encode64(v); - } - void WriteBool(int tag, bool v) { - Encode32(combine(tag, WIRETYPE_VARINT)); - EncodeBool(v); - } - void WriteString(int tag, const std::string& v) { - Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); - Encode32(v.size()); - EncodeBytes(v.data(), v.size()); - } - void WriteVarlengthBeginning(int tag, uint32_t len) { - Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); - Encode32(len); - } - void WriteRawBytes(const std::string& v) { EncodeBytes(v.data(), v.size()); } - - private: - // Note: this module's behavior must match the protocol buffer wire encoding - // format. - enum { - WIRETYPE_VARINT = 0, - WIRETYPE_LENGTH_DELIMITED = 2, - }; - static uint32_t combine(uint32_t tag, uint32_t type) { - return ((tag << 3) | type); - } - inline void Encode32(uint32_t v) { - if (v < 128) { - // Fast path for single-byte values. Many of the calls will use a - // constant value for v, so the comparison will get optimized away - // when Encode32 is inlined into the caller. - *p_ = v; - p_++; - } else { - p_ = EncodeVarint32(p_, v); - } - } - void Encode64(uint64_t v) { p_ = EncodeVarint64(p_, v); } - void EncodeBool(bool v) { - *p_ = (v ? 1 : 0); // Equal to varint32 encoding of 0 or 1 - p_++; - } - void EncodeBytes(const char* bytes, int N) { - memcpy(p_, bytes, N); - p_ += N; - } - - char* base_; - char* p_; - char* limit_; // Just for CHECKs -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h deleted file mode 100644 index 44359af1b1b2a6..00000000000000 --- a/paddle/fluid/operators/distributed/request_handler.h +++ /dev/null @@ -1,261 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include // NOLINT - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/platform/macros.h" - -namespace paddle { -namespace operators { -namespace distributed { - -constexpr char kRequestSend[] = "RequestSend"; -constexpr char kRequestGet[] = "RequestGet"; -constexpr char kRequestGetMonomerVariable[] = "RequestGetMonomerVariable"; -constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier"; -constexpr char kRequestPrefetch[] = "RequestPrefetch"; -constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; -constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; -constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier"; -constexpr char kRequestNotify[] = "RequestNotify"; -constexpr char kRequestSendAndRecv[] = "RequestSendAndRecv"; - -constexpr char kSendRPC[] = "SendRPC"; -constexpr char kGetRPC[] = "GetRPC"; -constexpr char kGetNoBarrierRPC[] = "GetNoBarrierRPC"; -constexpr char kGetMonomerRPC[] = "GetMonomerRPC"; -constexpr char kPrefetchRPC[] = "PrefetchRPC"; -constexpr char kBatchBarrierRPC[] = "BatchBarrierRPC"; -constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC"; -constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC"; -constexpr char kSendCompleteRPC[] = "SendCompleteRPC"; -constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC"; -constexpr char kSendAndRecvRPC[] = "SendAndRecvRPC"; -constexpr int64_t kPrefetchTimeout = 60000; - -#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" -#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" -#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" -#define COMPLETE_MESSAGE "COMPLETE@RECV" -#define WITHOUT_BARRIER_MESSAGE "@WITHOUT_BARRIER@RECV" -#define LEARNING_RATE_DECAY_COUNTER "@LR_DECAY_COUNTER@" -#define STEP_COUNTER "@PS_STEP_COUNTER@" - -#define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY" -#define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY" - -enum DistributedMode { kSync = 0, kAsync = 1, kHalfAsync = 2, kGeo = 3 }; - -class RPCServer; - -class VarHandle { - public: - VarHandle(const std::string ep, const std::string& method, - const std::string& name, - const platform::DeviceContext* p_ctx = nullptr, - const framework::Scope* p_scope = nullptr) - : status_(kDefaultState) { - ep_ = ep; - ctx_ = p_ctx; - scope_ = p_scope; - name_ = name; - method_ = method; - } - - virtual ~VarHandle() {} - - public: - bool should_retry = false; - - bool Wait() { - int ret = kDefaultState; - { - std::unique_lock lk(sync_mutex_); - wait_cond_.wait(lk, [this] { return status_ != kDefaultState; }); - ret = status_; - } - VLOG(7) << "VarHandle wait:" << ret; - return ret != kErrorState; - } - - void Finish(bool ok) { - { - std::unique_lock lk(sync_mutex_); - status_ = ok ? 
kFinishState : kErrorState; - } - VLOG(7) << "VarHandle finish:" << ok; - wait_cond_.notify_all(); - } - - std::string String() const { - std::ostringstream s; - s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], status:[" - << status_ << "]"; - return s.str(); - } - - std::string ep() const { return ep_; } - const platform::DeviceContext* ctx() const { return ctx_; } - const framework::Scope* scope() const { return scope_; } - std::string name() const { return name_; } - std::string method() const { return method_; } - - protected: - // RPC endpoint. - std::string ep_; - const platform::DeviceContext* ctx_; - const framework::Scope* scope_; - // Variable name. - std::string name_; - // RPC method name. - std::string method_; - - protected: - std::mutex sync_mutex_; - std::condition_variable wait_cond_; - - enum VarHandleStatus { - kDefaultState = -1, - kErrorState = 0, - kFinishState = 1, - }; - VarHandleStatus status_; - - private: - DISABLE_COPY_AND_ASSIGN(VarHandle); -}; - -typedef std::shared_ptr VarHandlePtr; - -class RequestHandler { - public: - explicit RequestHandler(int distributed_mode) - : distributed_mode_(distributed_mode), - dev_ctx_(nullptr), - executor_(nullptr), - scope_(nullptr), - program_(nullptr), - rpc_server_(nullptr) {} - - virtual ~RequestHandler() {} - - // Set attributes. - void SetScope(framework::Scope* scope) { scope_ = scope; } - void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; } - void SetProgram(framework::ProgramDesc* program) { program_ = program; } - void SetExecutor(framework::Executor* executor) { executor_ = executor; } - - // Used for dist lookup table prefetch - void SetPrefetchPreparedCtx( - std::unordered_map< - std::string, std::shared_ptr>* g) { - prefetch_var_name_to_prepared_ctx_ = g; - } - - void SetCheckpointNotifyPreparedCtx( - std::shared_ptr g) { - checkpoint_prepared_ctx_ = g; - } - - // Used for async. - void SetGradToPreparedCtx( - std::unordered_map< - std::string, std::shared_ptr>* g) { - grad_to_prepared_ctx_ = g; - } - - void SetSparseGradToParam(std::unordered_map* g) { - sparse_grad_to_param_ = g; - } - - void SetLrDecayPreparedCtx( - std::shared_ptr g) { - lr_decay_prepared_ctx_ = g; - } - - void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; } - - // Get attributes. - int distributed_mode() { return distributed_mode_; } - framework::Scope* scope() { return scope_; } - const platform::DeviceContext* dev_ctx() { return dev_ctx_; } - framework::ProgramDesc* program() { return program_; } - framework::Executor* executor() { return executor_; } - - // This function processes user's rpc request. - // The implemention is in request_handler_impl. 
- // example: - // std::string varname = request_.varname(); - // - // auto scope = request_handler_->scope(); - // auto invar = scope->FindVar(varname); - // framework::Variable* outvar = nullptr; - // - // request_handler_->Handle(varname, scope, invar, &outvar); - // if (outvar) { - // SerializeToByteBuffer(varname, outvar, - // *request_handler_->dev_ctx(), &reply_); - // } - virtual bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, - const std::string& out_var_name = "", - const std::string& table_name = "") = 0; - - protected: - const int distributed_mode_; - - const platform::DeviceContext* dev_ctx_; - framework::Executor* executor_; - framework::Scope* scope_; - framework::ProgramDesc* program_; - - // used for distribute lookup table prefetch - std::unordered_map>* - prefetch_var_name_to_prepared_ctx_; - // used for checkpoint notify - std::shared_ptr checkpoint_prepared_ctx_; - - // Used for async. - std::unordered_map>* - grad_to_prepared_ctx_; - std::unordered_map* sparse_grad_to_param_; - - // used for lr decay - std::shared_ptr lr_decay_prepared_ctx_; - RPCServer* rpc_server_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc deleted file mode 100644 index 8c4f2ef57a32c8..00000000000000 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ /dev/null @@ -1,354 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" -#include "paddle/fluid/string/piece.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/split.h" - -#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" - -namespace paddle { -namespace operators { -namespace distributed { - -// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables -// to directory specified. 
-constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath"; - -bool RequestSendHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestSendHandler:" << varname; - - // Sync - if (varname == BATCH_BARRIER_MESSAGE) { - VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE"; - rpc_server_->IncreaseBatchBarrier(kRequestSend); - } else if (varname == COMPLETE_MESSAGE) { - VLOG(3) << "sync: recv complete message"; - - if (HeartBeatMonitor::GetInstance() != nullptr) { - HeartBeatMonitor::GetInstance()->Update(trainer_id, "", COMPLETED); - } - - rpc_server_->Complete(); - } else { - // Async - if (distributed_mode_ != DistributedMode::kSync) { - VLOG(3) << "async process var: " << varname; - if (varname == BATCH_BARRIER_MESSAGE) { - PADDLE_THROW(platform::errors::InvalidArgument( - "async mode should not recv BATCH_BARRIER_MESSAGE or " - "COMPLETE_MESSAGE")); - } - HeartBeatMonitor::GetInstance()->Update(trainer_id, varname, RUNNING); - - std::string run_varname = varname; - - string::Piece part_piece("@PIECE"); - string::Piece var_name_piece = string::Piece(varname); - - if (string::Contains(var_name_piece, part_piece)) { - auto varname_splits = paddle::string::Split(varname, '@'); - PADDLE_ENFORCE_EQ( - varname_splits.size(), 3, - platform::errors::InvalidArgument( - "varname: %s should be separated into 3 parts by @", varname)); - run_varname = varname_splits[0]; - scope->Rename(varname, run_varname); - } - - auto *var = scope->FindVar(run_varname); - - // for sparse ids - if (var->IsType()) { - if (distributed_mode_ == DistributedMode::kAsync || - distributed_mode_ == DistributedMode::kHalfAsync) { - auto *ins = distributed::LargeScaleKV::GetInstance(); - if (ins->GradInLargeScale(run_varname)) { - auto *large_scale_var = ins->GetByGrad(run_varname); - - for (auto name : large_scale_var->CachedVarnames()) { - scope->Var(name); - } - } - } - if (distributed_mode_ == DistributedMode::kGeo) { - if (AsyncSparseParamUpdateRecorder::GetInstance()->HasGrad( - run_varname)) { - auto &grad_slr = - scope->FindVar(run_varname)->Get(); - AsyncSparseParamUpdateRecorder::GetInstance()->Update( - run_varname, grad_slr.rows()); - } - } - } - - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[run_varname].get(), - scope); - return true; - } else { // sync - rpc_server_->WaitCond(kRequestSend); - VLOG(3) << "sync: processing received var: " << varname; - PADDLE_ENFORCE_NOT_NULL( - invar, platform::errors::NotFound( - "sync: Can not find server side var %s.", varname)); - } - } - return true; -} - -bool RequestGetHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "RequestGetHandler:" << varname - << " out_var_name: " << out_var_name << " trainer_id: " << trainer_id - << " table_name: " << table_name; - - if (distributed_mode_ == DistributedMode::kSync) { - if (varname == FETCH_BARRIER_MESSAGE) { - VLOG(3) << "sync: recv fetch barrier message"; - rpc_server_->IncreaseBatchBarrier(kRequestGet); - } else { - rpc_server_->WaitCond(kRequestGet); - *outvar = scope_->FindVar(varname); - } - } else { - if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) { - if (enable_dc_asgd_) { - // NOTE: the format is determined by 
distribute_transpiler.py - std::string param_bak_name = - string::Sprintf("%s.trainer_%d_bak", varname, trainer_id); - VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id; - auto var = scope_->FindVar(varname); - auto t_orig = var->Get(); - auto param_bak = scope_->Var(param_bak_name); - auto t = param_bak->GetMutable(); - t->mutable_data(dev_ctx_->GetPlace(), t_orig.type()); - VLOG(3) << "copying " << varname << " to " << param_bak_name; - framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t); - } - - if (distributed_mode_ == DistributedMode::kGeo && - AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(varname) && - !table_name.empty()) { - VLOG(3) << "AsyncSparseParamUpdateRecorder " << varname << " exist "; - - std::vector updated_rows; - AsyncSparseParamUpdateRecorder::GetInstance()->GetAndClear( - varname, trainer_id, &updated_rows); - - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto &row_id : updated_rows) { - sstream << row_id << ", "; - } - sstream << "]"; - VLOG(3) << "updated_rows size: " << updated_rows.size() << " " - << sstream.str(); - } - - auto &origin_tensor = - scope_->FindVar(varname)->Get(); - auto *origin_tensor_data = origin_tensor.data(); - auto &dims = origin_tensor.dims(); - *outvar = scope->Var(); - auto *out_slr = (*outvar)->GetMutable(); - out_slr->set_rows(updated_rows); - out_slr->set_height(dims[0]); - auto out_dims = framework::make_ddim( - {static_cast(updated_rows.size()), dims[1]}); - auto *data = out_slr->mutable_value()->mutable_data( - out_dims, origin_tensor.place()); - auto width = dims[1]; - for (size_t i = 0; i < updated_rows.size(); ++i) { - PADDLE_ENFORCE_LT( - updated_rows[i], dims[0], - platform::errors::OutOfRange( - "The value of updated_rows: %s out of Tensor %s dims[0]: %s", - updated_rows[i], varname, dims[0])); - memcpy(data + i * width, origin_tensor_data + updated_rows[i] * width, - sizeof(float) * width); - } - } else { - *outvar = scope_->FindVar(varname); - } - } - } - return true; -} - -bool RequestGetNoBarrierHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestGetNoBarrierHandler:" << varname - << " out_var_name: " << out_var_name; - - // get var from pserver immediately without barriers - string::Piece without_barrier_piece(WITHOUT_BARRIER_MESSAGE); - string::Piece var_name_piece = string::Piece(varname); - - if (string::Contains(var_name_piece, without_barrier_piece)) { - var_name_piece = string::TrimSuffix(var_name_piece, without_barrier_piece); - VLOG(4) << "Get var " << var_name_piece << " with " - << WITHOUT_BARRIER_MESSAGE; - *outvar = scope_->FindVar(var_name_piece.ToString()); - return true; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "GetNoBarrier must contain %s", WITHOUT_BARRIER_MESSAGE)); - } - return true; -} - -bool RequestPrefetchHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestPrefetchHandler " << varname; - - (*outvar)->GetMutable(); - - VLOG(1) << "Prefetch " - << "tablename: " << table_name << " ids:" << varname - << " out: " << out_var_name; - paddle::platform::CPUPlace cpu_place; - auto *ins = distributed::LargeScaleKV::GetInstance(); - - if 
(ins->ParamInLargeScale(table_name)) { - auto lookup_table_op = PullLargeScaleOp(table_name, varname, out_var_name); - lookup_table_op->Run(*scope, cpu_place); - } else { - auto lookup_table_op = - BuildLookupTableOp(table_name, varname, out_var_name); - lookup_table_op->Run(*scope, cpu_place); - } - - return true; -} - -bool RequestCheckpointHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "receive save var " << varname << " with path " << out_var_name - << " mode " << table_name; - - int mode = std::stoi(table_name); - - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Save(out_var_name, mode); - return true; -} - -bool RequestNotifyHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "RequestNotifyHandler: " << varname - << ", trainer_id: " << trainer_id; - - string::Piece decay_piece(STEP_COUNTER); - string::Piece var_name_piece = string::Piece(varname); - if (string::Contains(var_name_piece, decay_piece)) { - VLOG(3) << "LearningRate Decay Counter Update"; - - auto *send_var = scope->FindVar(varname); - auto send_var_tensor = send_var->Get(); - auto *send_value = - send_var_tensor.mutable_data(send_var_tensor.place()); - - auto counter = decay_counters.at(trainer_id); - counter += send_value[0]; - decay_counters.at(trainer_id) = counter; - - auto *global_step_var = this->scope()->FindVar(LEARNING_RATE_DECAY_COUNTER); - if (global_step_var == nullptr) { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not find LEARNING_RATE_DECAY_COUNTER ")); - } - - auto *tensor = global_step_var->GetMutable(); - auto *value = tensor->mutable_data(platform::CPUPlace()); - - auto global_counter = 0; - for (auto &trainer_counter : decay_counters) { - global_counter += trainer_counter.second; - } - value[0] = global_counter; - - if (lr_decay_prepared_ctx_.get() == nullptr) { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not find decay block for executor")); - } - - executor_->RunPreparedContext(lr_decay_prepared_ctx_.get(), scope_); - } - return true; -} - -bool RequestSendAndRecvHandler::Handle(const std::string &varname, - framework::Scope *Scope, - framework::Variable *var, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "SendAndRecvHandle: " << varname - << " out_var_name: " << out_var_name - << " , trainer_id: " << trainer_id; - - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), Scope); - *outvar = Scope->FindVar(out_var_name); - return true; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h deleted file mode 100644 index 6d239673f91041..00000000000000 --- a/paddle/fluid/operators/distributed/request_handler_impl.h +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -class Variable; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestSendHandler final : public RequestHandler { - public: - explicit RequestSendHandler(int distributed_mode, bool enable_dc_asgd = false) - : RequestHandler(distributed_mode) { - enable_dc_asgd_ = enable_dc_asgd; - } - virtual ~RequestSendHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - bool enable_dc_asgd_; -}; - -class RequestGetHandler final : public RequestHandler { - public: - explicit RequestGetHandler(int distributed_mode, bool enable_dc_asgd = false) - : RequestHandler(distributed_mode) { - enable_dc_asgd_ = enable_dc_asgd; - } - virtual ~RequestGetHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - bool enable_dc_asgd_; -}; - -class RequestGetNoBarrierHandler final : public RequestHandler { - public: - RequestGetNoBarrierHandler() : RequestHandler(false) {} - virtual ~RequestGetNoBarrierHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; -}; - -static inline void BuildVar(const std::string& param_name, - std::initializer_list arguments, - paddle::framework::proto::OpDesc::Var* var) { - var->set_parameter(param_name); - for (auto& arg_name : arguments) { - *var->mutable_arguments()->Add() = arg_name; - } -} - -class RequestPrefetchHandler final : public RequestHandler { - public: - explicit RequestPrefetchHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - virtual ~RequestPrefetchHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - std::unique_ptr PullLargeScaleOp( - const std::string& table_name, const std::string& id_name, - const std::string& out_name) { - framework::OpDesc desc; - 
desc.SetType("lookup_sparse_table_read"); - desc.SetInput("Ids", {id_name}); - desc.SetOutput("Out", std::vector({out_name})); - desc.SetAttr("tablename", {table_name}); - desc.SetAttr("init", true); - desc.SetAttr("value_names", std::vector({"Param"})); - - auto op = paddle::framework::OpRegistry::CreateOp(desc); - return op; - } - - std::unique_ptr BuildLookupTableOp( - const std::string& table_name, const std::string& id_name, - const std::string& out_name) { - paddle::framework::proto::OpDesc op_desc; - op_desc.set_type("lookup_table"); - BuildVar("W", {table_name.data()}, op_desc.add_inputs()); - BuildVar("Ids", {id_name.data()}, op_desc.add_inputs()); - BuildVar("Out", {out_name.data()}, op_desc.add_outputs()); - - auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - return op; - } -}; - -class RequestCheckpointHandler final : public RequestHandler { - public: - explicit RequestCheckpointHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - - virtual ~RequestCheckpointHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - std::unique_ptr BuildCheckpointOp( - const std::string& varname, const std::string& file_path) { - paddle::framework::proto::OpDesc op_desc; - op_desc.set_type("save"); - BuildVar("X", {varname.data()}, op_desc.add_inputs()); - - auto attr = op_desc.mutable_attrs()->Add(); - attr->set_name("file_path"); - attr->set_type(paddle::framework::proto::AttrType::STRING); - attr->set_s(file_path); - - auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - return op; - } -}; - -class RequestNotifyHandler final : public RequestHandler { - public: - explicit RequestNotifyHandler(int distributed_mode, int trainers) - : RequestHandler(distributed_mode) { - this->trainers = trainers; - for (int i = 0; i < trainers; i++) { - decay_counters[i] = 0; - } - } - virtual ~RequestNotifyHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - int trainers; - std::unordered_map decay_counters; -}; - -class RequestSendAndRecvHandler final : public RequestHandler { - public: - explicit RequestSendAndRecvHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - virtual ~RequestSendAndRecvHandler() {} - bool Handle(const std::string& varname, framework::Scope* Scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.cc b/paddle/fluid/operators/distributed/rpc_client.cc deleted file mode 100644 index 57ce54870decf2..00000000000000 --- a/paddle/fluid/operators/distributed/rpc_client.cc +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "gflags/gflags.h" - -// default to 3min to avoid temporary network failures. -DEFINE_int32(rpc_deadline, 180000, "deadline timeouts for rpc"); -DEFINE_int32(rpc_retry_times, 3, "retry times for rpc"); - -namespace paddle { -namespace operators { -namespace distributed { - -std::once_flag RPCClient::init_flag_; -std::unique_ptr<RPCClient> RPCClient::rpc_client_(nullptr); -int RPCClient::trainer_id_ = 0; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h deleted file mode 100644 index 2c756a6f71ff94..00000000000000 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include // NOLINT -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -DECLARE_int32(rpc_deadline); -DECLARE_int32(rpc_retry_times); - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient { - public: - RPCClient() {} - virtual ~RPCClient() {} - virtual VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncPrefetchVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& in_var_name, - const std::string& out_var_name, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendFetchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendAndRecv( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& send_var_name, - const std::string& recv_var_name, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendComplete( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - // Complete tells all the pserver instances that finishe the training, - // the pserver can reduce it's barrier count, and continue to train - // with other trainers. - virtual void SendComplete() = 0; - - virtual bool Wait() = 0; - - template - static RPCClient* GetInstance(int trainer_id) { - std::call_once(init_flag_, &RPCClient::Init, trainer_id); - return rpc_client_.get(); - } - - // Init is called by GetInstance. 
- template - static void Init(int trainer_id) { - VLOG(1) << "init rpc client with trainer_id " << trainer_id; - trainer_id_ = trainer_id; - if (rpc_client_.get() == nullptr) { - rpc_client_.reset(new T()); - rpc_client_->InitImpl(); - } - } - - virtual void InitImpl() {} - - protected: - // each trainer have exact one trainer id, it should be static - static int trainer_id_; - - private: - static std::once_flag init_flag_; - static std::unique_ptr rpc_client_; -}; -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc deleted file mode 100644 index 37cf0460fb1fa1..00000000000000 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/rpc_server.h" - -#include -#include - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestHandler; - -void RPCServer::ShutDown() { - VLOG(3) << "RPCServer ShutDown "; - ShutDownImpl(); - - exit_flag_ = true; - barrier_cond_.notify_all(); - rpc_cond_.notify_all(); -} - -void RPCServer::SavePort() const { - auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid()); - std::ofstream port_file; - port_file.open(file_path); - port_file << selected_port_; - port_file.close(); - VLOG(3) << "selected port written to " << file_path; -} - -void RPCServer::WaitBarrier(const std::string& rpc_name) { - VLOG(3) << "WaitBarrier in: " << rpc_name; - std::unique_lock lock(this->mutex_); - barrier_cond_.wait(lock, [this, &rpc_name] { - return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) || - exit_flag_.load()); - }); - - VLOG(3) << "WaitBarrier out: " << rpc_name - << " counter: " << barrier_counter_[rpc_name]; -} - -void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { - VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; - // barrier msg should make sure that it's in the right cond(send|recv) - WaitCond(rpc_name); - int b = 0; - std::unique_lock lock(mutex_); - b = ++barrier_counter_[rpc_name]; - VLOG(3) << rpc_name << " barrier_counter: " << b; - if (b >= client_num_) { - lock.unlock(); - VLOG(3) << "BatchBarrier counter reach " << client_num_ << " for " - << rpc_name; - barrier_cond_.notify_all(); - lock.lock(); - } -} - -void RPCServer::Complete() { - { - std::unique_lock lock(mutex_); - client_num_--; - need_reset_all_vars_ = true; - - VLOG(3) << "decrease client_num to: " << client_num_; - if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { - barrier_counter_[kRequestGet]--; - } - } - barrier_cond_.notify_all(); -} - -bool RPCServer::NeedResetAllVars() { - std::unique_lock 
lock(mutex_); - return need_reset_all_vars_; -} - -int RPCServer::GetClientNum() { - std::unique_lock lock(mutex_); - return client_num_; -} - -void RPCServer::ResetBarrierCounter() { - VLOG(3) << "RPCServer ResetBarrierCounter "; - std::unique_lock lock(mutex_); - for (auto& t : barrier_counter_) { - t.second = 0; - } - need_reset_all_vars_ = false; -} - -void RPCServer::RegisterRPC(const std::string& rpc_name, - RequestHandler* handler, int thread_num) { - rpc_call_map_[rpc_name] = handler; - rpc_thread_num_[rpc_name] = thread_num; - - static int cond = -1; - rpc_cond_map_[rpc_name] = ++cond; - VLOG(3) << "RegisterRPC rpc_name: " << rpc_name << ", handler: " << handler - << ", cond: " << rpc_cond_map_[rpc_name]; -} - -void RPCServer::SetCond(const std::string& rpc_name) { - VLOG(3) << "RPCServer SetCond " << rpc_name; - { - std::unique_lock lock(mutex_); - cur_cond_ = rpc_cond_map_[rpc_name]; - } - - rpc_cond_.notify_all(); -} - -void RPCServer::WaitCond(const std::string& rpc_name) { - VLOG(3) << "RPCServer WaitCond in " << rpc_name; - int cond = 0; - { - std::unique_lock lock(mutex_); - cond = rpc_cond_map_[rpc_name]; - } - - std::unique_lock lock(mutex_); - rpc_cond_.wait( - lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); - VLOG(3) << "RPCServer WaitCond out " << rpc_name; -} - -void RPCServer::RegisterVar(const std::string& var_name, - const std::string& rpc_name, - framework::Scope* scope, - platform::DeviceContext* dev_ctx) { - MonomerHandle h; - h.var_name_ = var_name; - h.rpc_name_ = rpc_name; - h.scope_ = scope; - h.dev_ctx_ = dev_ctx; - - { - std::unique_lock lock(mutex_); - PADDLE_ENFORCE_EQ( - var_map_.find(var_name), var_map_.end(), - platform::errors::AlreadyExists("%s already in var_map.", var_name)); - var_map_[var_name] = h; - } - - rpc_cond_.notify_all(); - VLOG(3) << "RegisterVar context:" << h.String(); -} - -void RPCServer::IncreaseVarBarrier(const std::string& var_name) { - int b = 0; - MonomerHandle h; - { - std::unique_lock lock(mutex_); - b = ++var_map_[var_name].barrier_; - h = var_map_[var_name]; - } - - if (b >= client_num_) { - barrier_cond_.notify_all(); - } - - VLOG(3) << "IncreaseVarBarrier context:" << h.String(); -} - -void RPCServer::WaitVarBarrier(const std::string& var_name) { - VLOG(3) << "WaitVarBarrier var_name:" << var_name; - - std::unique_lock lock(mutex_); - barrier_cond_.wait(lock, [&]() { - return ((var_map_[var_name].barrier_ >= client_num_ && client_num_ != 0) || - exit_flag_.load()); - }); - - VLOG(3) << "WaitVarBarrier context: " << var_map_[var_name].String(); -} - -void RPCServer::SetVarCond(const std::string& var_name) { - VLOG(3) << "SetVarCond var_name:" << var_name; - { - std::unique_lock lock(mutex_); - if (var_map_.find(var_name) != var_map_.end()) { - rpc_cond_.notify_all(); - } - } -} - -void RPCServer::WaitVarCond(const std::string& var_name) { - VLOG(3) << "WaitVarCond var_name:" << var_name; - - std::unique_lock lock(mutex_); - rpc_cond_.wait(lock, [=] { - return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load()); - }); - - VLOG(3) << "WaitVarCond var_name:" << var_name << " end"; -} - -MonomerHandle RPCServer::GetMonomer(const std::string& var_name) { - MonomerHandle h; - { - std::unique_lock lock(mutex_); - h = var_map_[var_name]; - } - - return h; -} - -void RPCServer::ClearRegisteredVars() { - std::unique_lock lock(mutex_); - var_map_.clear(); -} - -void RPCServer::ClearVar(const std::string& var_name) { - std::unique_lock lock(mutex_); - var_map_.erase(var_name); -} -} // namespace 
distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h deleted file mode 100644 index 2120260515e255..00000000000000 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include // NOLINT -#include -#include -#include - -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestHandler; - -struct MonomerHandle { - std::string var_name_; - std::string rpc_name_; - framework::Scope* scope_{nullptr}; - platform::DeviceContext* dev_ctx_{nullptr}; - int64_t barrier_{0}; - - std::string String() { - std::stringstream ss; - ss << "var_name:" << var_name_ << ", rpc_name:" << rpc_name_ - << ", scope:" << scope_ << ", dev_ctx:" << dev_ctx_ - << ", barrier_:" << barrier_; - return ss.str(); - } -}; - -class RPCServer { - public: - explicit RPCServer(const std::string& address, int client_num) - : cur_cond_(0), - bind_address_(address), - exit_flag_(false), - selected_port_(0), - client_num_(client_num), - need_reset_all_vars_(false) {} - - virtual ~RPCServer() {} - virtual void StartServer() = 0; - virtual void WaitServerReady() = 0; - - void ShutDown(); - - bool IsExit() { return exit_flag_.load(); } - - int GetSelectedPort() const { return selected_port_; } - - int GetClientNum(); - - void SavePort() const; - - // RegisterRPC, register the rpc method name to a handler - // class, and auto generate a condition id for this call - // to be used for the barrier. - void RegisterRPC(const std::string& rpc_name, RequestHandler* handler, - int thread_num = 1); - - int GetThreadNum(const std::string& rpc_name) { - return rpc_thread_num_[rpc_name]; - } - - // Wait util all the clients have reached the barrier for one - // rpc method. This function should be called in the - // RequestHandler if you want to run the server/client in a - // synchronous mode. 
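The barrier bookkeeping declared next boils down to a per-RPC counter guarded by a mutex and condition variable: trainers bump the counter, the server blocks until it reaches client_num_, then the counter is reset for the next round. A hedged standalone sketch of that pattern (BatchBarrier is a hypothetical name used only for illustration; the real server keys counters by rpc_name and also wakes waiters when exit_flag_ is set):

#include <condition_variable>
#include <mutex>

// Counter-based barrier: Increase() is the trainer-side bump,
// Wait() is the server-side block, Reset() starts the next round.
class BatchBarrier {
 public:
  explicit BatchBarrier(int client_num) : client_num_(client_num) {}

  void Increase() {
    std::unique_lock<std::mutex> lock(mutex_);
    if (++counter_ >= client_num_) cond_.notify_all();
  }

  void Wait() {
    std::unique_lock<std::mutex> lock(mutex_);
    cond_.wait(lock, [this] { return counter_ >= client_num_; });
  }

  void Reset() {
    std::unique_lock<std::mutex> lock(mutex_);
    counter_ = 0;
  }

 private:
  std::mutex mutex_;
  std::condition_variable cond_;
  const int client_num_;
  int counter_{0};
};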
- void WaitBarrier(const std::string& rpc_name); - - void SetCond(const std::string& rpc_name); - void WaitCond(const std::string& rpc_name); - void IncreaseBatchBarrier(const std::string rpc_name); - - void RegisterVar(const std::string& var_name, const std::string& rpc_name, - framework::Scope* scope, platform::DeviceContext* dev_ctx); - void IncreaseVarBarrier(const std::string& var_name); - void WaitVarBarrier(const std::string& var_name); - void SetVarCond(const std::string& var_name); - void WaitVarCond(const std::string& var_name); - void ClearRegisteredVars(); - void ClearVar(const std::string& var_name); - MonomerHandle GetMonomer(const std::string& var_name); - - void Complete(); - - void ResetBarrierCounter(); - - bool NeedResetAllVars(); - - protected: - virtual void ShutDownImpl() = 0; - - private: - std::mutex mutex_; - std::unordered_map barrier_counter_; - std::condition_variable barrier_cond_; - - std::unordered_map rpc_cond_map_; - std::atomic cur_cond_; - std::condition_variable rpc_cond_; - - protected: - std::string bind_address_; - std::atomic exit_flag_; - int selected_port_; - int client_num_; - bool need_reset_all_vars_; - - std::unordered_map rpc_call_map_; - std::unordered_map rpc_thread_num_; - friend class RequestHandler; - - // TODO(gongwb): use more cond to notify or wait; - std::unordered_map var_map_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc deleted file mode 100644 index f59285400033df..00000000000000 --- a/paddle/fluid/operators/distributed/rpc_server_test.cc +++ /dev/null @@ -1,344 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace distributed = paddle::operators::distributed; - -USE_NO_KERNEL_OP(lookup_sparse_table_read); -USE_NO_KERNEL_OP(checkpoint_notify); -USE_OP(scale); - -std::unique_ptr g_rpc_service; -std::unique_ptr g_req_handler; - -framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { - auto root_block = program->MutableBlock(0); - auto* block = program->AppendBlock(*root_block); - - framework::OpDesc* op = block->AppendOp(); - op->SetType("scale"); - op->SetInput("X", {"x"}); - op->SetOutput("Out", {"res"}); - op->SetAttr("scale", 0.5f); - - auto& out = *root_block->Var("res"); - out.SetType(framework::proto::VarType::LOD_TENSOR); - out.SetShape({1, 10}); - - return block; -} - -void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { - auto w_var = scope->Var("w"); - w_var->GetMutable(); - - auto out_var = scope->Var("out"); - out_var->GetMutable(); - - auto ids_var = scope->Var("ids"); - ids_var->GetMutable(); - - auto x_var = scope->Var("x"); - x_var->GetMutable(); - - auto res_var = scope->Var("res"); - res_var->GetMutable(); -} - -void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, - int64_t rows_numel) { - CreateVarsOnScope(scope, place); - auto ids_var = scope->Var("ids")->GetMutable(); - int64_t* ids_ptr = - ids_var->mutable_data(framework::DDim({rows_numel, 1}), *place); - for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2; - - auto x_var = scope->Var("x")->GetMutable(); - float* x_ptr = - x_var->mutable_data(framework::DDim({1, rows_numel}), *place); - for (int64_t i = 0; i < rows_numel; ++i) x_ptr[i] = 1.0; -} - -void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, - int64_t rows_numel) { - CreateVarsOnScope(scope, place); - auto w = scope->Var("w")->GetMutable(); - auto w_value = w->mutable_value(); - w_value->Resize({rows_numel, 10}); - for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true); - - auto ptr = w_value->mutable_data(*place); - - for (int64_t i = 0; i < w_value->numel(); ++i) { - ptr[i] = static_cast(i / 10); - } -} - -void StartServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - - std::unordered_map> - prefetch_var_name_to_prepared; - - g_req_handler->SetProgram(&program); - g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - - // distributed::HeartBeatMonitor::Init(1, true, "w@grad"); - - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - 
std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -void StartSendAndRecvServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - auto block = AppendSendAndRecvBlock(&program); - std::string in_var_name("x"); - std::vector prefetch_block_ids{block->ID()}; - auto prepared = exe.Prepare(program, prefetch_block_ids); - InitTensorsOnServer(&scope, &place, 10); - - std::unordered_map> - grad_to_prepared_ctx; - grad_to_prepared_ctx[in_var_name] = prepared[0]; - - g_req_handler->SetProgram(&program); - g_req_handler->SetGradToPreparedCtx(&grad_to_prepared_ctx); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -TEST(COMPLETE, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - g_req_handler.reset( - new distributed::RequestSendHandler(distributed::DistributedMode::kSync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 2)); - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - std::thread server_thread(StartServer, distributed::kRequestSend); - g_rpc_service->WaitServerReady(); - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - client->AsyncSendComplete(ep); - client->Wait(); - - EXPECT_EQ(g_rpc_service->GetClientNum(), 1); - - g_rpc_service->ShutDown(); - server_thread.join(); - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} - -TEST(SENDANDRECV, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - g_req_handler.reset(new distributed::RequestSendAndRecvHandler( - distributed::DistributedMode::kAsync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - std::thread server_thread(StartSendAndRecvServer, - distributed::kRequestSendAndRecv); - g_rpc_service->WaitServerReady(); - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - - framework::Scope scope; - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); - - // create var on local scope - int64_t rows_numel = 10; - InitTensorsOnClient(&scope, &place, rows_numel); - std::string in_var_name("x"); - std::string out_var_name("res"); - - client->AsyncSendAndRecv(ep, ctx, scope, in_var_name, out_var_name); - client->Wait(); - auto var = scope.Var(out_var_name); - auto value = var->GetMutable(); - auto ptr = value->mutable_data(place); - - for (int64_t i = 0; i < rows_numel; ++i) { - EXPECT_EQ(ptr[i], 0.5); - } - g_rpc_service->ShutDown(); - server_thread.join(); - LOG(INFO) << "begin reset"; - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} - -void StartCheckpointServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - 
framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - - std::vector metas; - - auto meta = distributed::SparseMeta(); - meta.name = "embedding.block0"; - meta.value_names = {"Param"}; - meta.value_dims = {64}; - meta.mode = distributed::Mode::training; - meta.grad_name = "embedding@Grad"; - meta.cached_varnames = {"kSparseIds"}; - meta.initializer_attrs = {"fill_constant&1.0"}; - meta.entry = "none"; - - metas.push_back(meta); - distributed::LargeScaleKV::Init(metas); - - auto* ins = distributed::LargeScaleKV::GetInstance(); - ins->Get("embedding.block0")->Init({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); - - std::unordered_map> - prefetch_var_name_to_prepared; - - g_req_handler->SetProgram(&program); - g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -TEST(LARGE_SCALE_CHECKPOINT, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - - paddle::framework::Scope scope; - paddle::platform::CPUPlace place; - - g_req_handler.reset(new distributed::RequestCheckpointHandler( - distributed::DistributedMode::kAsync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); - - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - - std::thread server_thread(StartCheckpointServer, - distributed::kRequestCheckpoint); - g_rpc_service->WaitServerReady(); - - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - - auto save_path = - paddle::string::Sprintf("%s/%s/%s", "/tmp/large_scale_table/base", - "embedding", "embedding.block0"); - int mode = 0; - client->AsyncCheckpointNotify(ep, save_path, "embedding.block0", mode); - client->Wait(); - - save_path = - paddle::string::Sprintf("%s/%s/%s", "/tmp/large_scale_table/delta", - "embedding", "embedding.block0"); - mode = 1; - client->AsyncCheckpointNotify(ep, save_path, "embedding.block0", mode); - client->Wait(); - - paddle::framework::AttributeMap attrs; - - std::vector eps = {ep}; - attrs["endpoints"] = eps; - attrs["dirname"] = std::string("/tmp/large_scale_table/delta1"); - attrs["varname"] = std::string("embedding"); - attrs["mode"] = 2; - std::vector slices = {"embedding.block0"}; - attrs["slice_varnames"] = slices; - std::vector remotes = {"embedding.block0"}; - attrs["remote_varnames"] = remotes; - - auto ops = - framework::OpRegistry::CreateOp("checkpoint_notify", {}, {}, attrs, true); - ops->Run(scope, place); - - g_rpc_service->ShutDown(); - server_thread.join(); - LOG(INFO) << "begin reset"; - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in deleted file mode 100644 index a333642bd16fbf..00000000000000 --- a/paddle/fluid/operators/distributed/send_recv.proto.in +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under -the Apache License, Version 2.0 (the "License"); you may not use this file -except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -syntax = "proto3"; -package sendrecv; - -option cc_generic_services = @cc_generic_services@; - -service SendRecvService { - // For parameter server round-robin like hashing, do not split tensors. - // Send and recv only one tensor - // TODO(typhoonzero): add streaming API - rpc SendVariable(VariableMessage) returns (VoidMessage) {} - // Argument VariableMessage for GetVariable should only contain varname. - rpc GetVariable(VariableMessage) returns (VariableMessage) {} - rpc GetVariableNoBarrier(VariableMessage) returns (VariableMessage) {} - // pre-fetch variable by given variable name and Ids - rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} - - rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} - rpc DistributeNotify(VariableMessage) returns (VoidMessage) {} - rpc SendAndRecvVariable(VariableMessage) returns (VariableMessage) {} - rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {} - rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} -} - -// It can be: LoDTensor、SelectedRows or NCCL_ID -enum VarType { - LOD_TENSOR = 0; - SELECTED_ROWS = 1; - NCCL_ID = 2; -} - -// VariableMessage is serialized paddle variable message. -// NOTICE(gongwb):don't modify this proto if you are not -// not familar with how we serialize in sendrecvop_utils.h -// and deserilize it in variable_response.h. -message VariableMessage { - enum Type { - // Pod Types - BOOL = 0; - INT16 = 1; - INT32 = 2; - INT64 = 3; - FP16 = 4; - FP32 = 5; - FP64 = 6; - } - - message LodData { repeated int64 lod_data = 1; } - string varname = 1; - // TODO(Yancey1989): reference framework::proto::VarDesc::VarType - VarType type = 2; - // bool persistable is not needed for sending. - // tensor info: - Type data_type = 3; - repeated int64 dims = 4; - - // lod details: - int64 lod_level = 5; - repeated LodData lod = 6; - // selected_rows height, aka. original dim0 - int64 slr_height = 7; - // tensor data - bytes serialized = 8; - // selected_rows data - bytes rows = 9; - // Look up table block execution output variable name. - string out_varname = 10; - // If 1, the ps server will start profiling, the ps - // server stops profiling and generates a profile to /tmp/profile_ps_* - // when profile switches from 1 to 2. - int64 profile = 11; - int64 trainer_id = 12; - string table_name = 13; -} - -message VoidMessage {} diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc deleted file mode 100644 index 107c74eb2670e4..00000000000000 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include
-
-#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-
-namespace paddle {
-namespace framework {
-class Variable;
-}  // namespace framework
-namespace memory {
-namespace allocation {
-class Allocation;
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
-
-DEFINE_bool(rpc_disable_reuse_port, false, "Disable SO_REUSEPORT or not.");
-DEFINE_int32(rpc_retry_bind_port, 3,
-             "Retry to bind the address if address is already used.");
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-using VarMsg = sendrecv::VariableMessage;
-
-static TensorPayload GetCommunicationAllocationFromTensor(
-    const platform::DeviceContext& ctx, const framework::Tensor& tensor) {
-  if (is_gpu_place(ctx.GetPlace())) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    PADDLE_ENFORCE_EQ(
-        is_gpu_place(tensor.place()), true,
-        platform::errors::PreconditionNotMet("Please run in gpu place."));
-    auto& gpu_dev_ctx =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx);
-    auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
-    platform::CUDAPinnedPlace cuda_pinned;
-    auto result = memory::AllocShared(cuda_pinned, copy_size);
-
-    memory::Copy(cuda_pinned, result->ptr(),
-                 BOOST_GET_CONST(platform::CUDAPlace, tensor.place()),
-                 tensor.data<void>(), copy_size, gpu_dev_ctx.stream());
-    ctx.Wait();
-    return TensorPayload(result);
-#else
-    PADDLE_THROW(
-        platform::errors::Unavailable("This situation should not be happened"));
-#endif
-  } else {
-    return TensorPayload(tensor);
-  }
-}
-TensorPayload GetTensorPayload(framework::Variable* var,
-                               const platform::DeviceContext& ctx,
-                               VarMsg* request) {
-  auto tensor = var->Get<framework::LoDTensor>();
-  // FIXME(wuyi): data types in send_recv.proto is copied from
-  // framework.proto
-  request->set_data_type(static_cast<VarMsg::Type>(tensor.type()));
-  for (auto& dim : framework::vectorize(tensor.dims())) {
-    request->add_dims(dim);
-  }
-  const framework::LoD lod = tensor.lod();
-  if (lod.size() > 0) {
-    request->set_lod_level(lod.size());
-    for (auto& each : lod) {
-      VarMsg::LodData* lod_inner = request->add_lod();
-      for (auto& d : each) {
-        lod_inner->add_lod_data(d);
-      }
-    }
-  }
-  return GetCommunicationAllocationFromTensor(ctx, tensor);
-}
-
-TensorPayload GetSelectedRowsPayload(framework::Variable* var,
-                                     const platform::DeviceContext& ctx,
-                                     VarMsg* request) {
-  auto* slr = var->GetMutable<framework::SelectedRows>();
-  request->set_data_type(static_cast<VarMsg::Type>(slr->value().type()));
-  request->set_lod_level(0);
-  request->set_slr_height(slr->height());
-
-  for (auto& dim : framework::vectorize(slr->value().dims())) {
-    request->add_dims(dim);
-  }
-
-  auto* tensor = slr->mutable_value();
-  return GetCommunicationAllocationFromTensor(ctx, *tensor);
-}
-
-TensorPayload::TensorPayload(std::shared_ptr<memory::Allocation> allocation)
-    : allocation_(allocation), offset_(0), memory_size_(allocation->size()) {}
-TensorPayload::TensorPayload(const framework::Tensor& tensor)
-    : allocation_(tensor.Holder()),
-      offset_(tensor.offset()),
-      memory_size_(tensor.numel() * framework::SizeOfType(tensor.type())) {}
-void* TensorPayload::ptr() const {
-  return reinterpret_cast<void*>(
-      reinterpret_cast<uintptr_t>(allocation_->ptr()) + offset_);
-}
-size_t TensorPayload::memory_size() const { return memory_size_; }
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h
deleted file mode 100644
index 84ed1ab0247124..00000000000000
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include
-#include
-#include
-#include
-#include
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/distributed/distributed_pb.h"
-#include "paddle/fluid/platform/port.h"
-
-namespace paddle {
-namespace framework {
-class Tensor;
-class Variable;
-}  // namespace framework
-namespace memory {
-namespace allocation {
-class Allocation;
-}  // namespace allocation
-}  // namespace memory
-namespace platform {
-class DeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-using VarMsg = sendrecv::VariableMessage;
-
-class TensorPayload final {
- public:
-  explicit TensorPayload(const framework::Tensor& tensor);
-  explicit TensorPayload(std::shared_ptr<memory::Allocation> allocation);
-
-  TensorPayload(const TensorPayload& o) = default;
-  TensorPayload& operator=(const TensorPayload& o) = default;
-
-  void* ptr() const;
-  size_t memory_size() const;
-
- private:
-  std::shared_ptr<memory::Allocation> allocation_;
-  size_t offset_;
-  size_t memory_size_;
-};
-
-inline void SerializeDestroyCallback(void* payload) {
-  if (payload != nullptr) {
-    auto* shared_payload = reinterpret_cast<TensorPayload*>(payload);
-    delete shared_payload;
-  }
-}
-
-TensorPayload GetTensorPayload(framework::Variable* var,
-                               const platform::DeviceContext& ctx,
-                               VarMsg* request);
-
-TensorPayload GetSelectedRowsPayload(framework::Variable* var,
-                                     const platform::DeviceContext& ctx,
-                                     VarMsg* request);
-
-inline framework::proto::VarType::Type ToVarType(
-    sendrecv::VariableMessage::Type type) {
-  switch (type) {
-    case sendrecv::VariableMessage::FP32:
-      return framework::proto::VarType::FP32;  // NOLINT
-    case sendrecv::VariableMessage::FP64:
-      return framework::proto::VarType::FP64;  // NOLINT
-    case sendrecv::VariableMessage::INT32:
-      return framework::proto::VarType::INT32;  // NOLINT
-    case sendrecv::VariableMessage::INT64:
-      return framework::proto::VarType::INT64;  // NOLINT
-    case sendrecv::VariableMessage::BOOL:
-      return framework::proto::VarType::BOOL;  // NOLINT
-    default:
-      PADDLE_THROW(
-          platform::errors::InvalidArgument("Not support type id: %d.", type));
-  }
-}
-
-template