diff --git a/CMakeLists.txt b/CMakeLists.txt index 219f6fe20bafb..98772e9678153 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,6 +38,8 @@ project(paddle CXX C) # enable language CUDA # TODO(Shibo Tao): remove find_package(CUDA) completely. find_package(CUDA QUIET) +find_package(MKL CONFIG QUIET) +option(WITH_ONEMKL "Compile PaddlePaddle with oneMKL" OFF) option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) @@ -225,6 +227,7 @@ option(WITH_STRIP "Strip so files of Whl packages" OFF) option(NEW_RELEASE_CUBIN "PaddlePaddle next-level release strategy for pypi cubin package" OFF) option(NEW_RELEASE_JIT "PaddlePaddle next-level release strategy for backup jit package" OFF) option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF) +option(WITH_POCKETFFT "Compile with pocketfft support" ON) # PY_VERSION if(NOT PY_VERSION) @@ -373,6 +376,10 @@ if (WITH_MIPS) add_definitions(-DPADDLE_WITH_MIPS) endif() +if (WITH_ONEMKL) + add_definitions(-DPADDLE_WITH_ONEMKL) +endif() + if (WITH_HETERPS) if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new") diff --git a/cmake/FindGperftools.cmake b/cmake/FindGperftools.cmake index 928f573a4fb82..318f9f5fd3b5a 100644 --- a/cmake/FindGperftools.cmake +++ b/cmake/FindGperftools.cmake @@ -20,7 +20,7 @@ find_library(GPERFTOOLS_TCMALLOC NAMES tcmalloc HINTS ${Gperftools_ROOT_DIR}/lib) - + find_library(GPERFTOOLS_PROFILER NAMES profiler HINTS ${Gperftools_ROOT_DIR}/lib) diff --git a/cmake/external/lapack.cmake b/cmake/external/lapack.cmake new file mode 100644 index 0000000000000..c28c31c906aa2 --- /dev/null +++ b/cmake/external/lapack.cmake @@ -0,0 +1,69 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE (ExternalProject) + +SET(LAPACK_PREFIX_DIR ${THIRD_PARTY_PATH}/lapack) +SET(LAPACK_SOURCE_DIR ${THIRD_PARTY_PATH}/lapack/src/extern_lapack) +SET(LAPACK_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lapack) +SET(LAPACK_INCLUDE_DIR ${LAPACK_SOURCE_DIR}) +SET(LAPACK_LIB_DIR ${LAPACK_INSTALL_DIR}/lib) + +# Note(zhouwei): lapack need fortan compiler which many machines don't have, so use precompiled library. 
+# use lapack tag v3.10.0 on 06/28/2021 https://github.com/Reference-LAPACK/lapack +if(LINUX) + SET(LAPACK_VER "lapack_lnx_v3.10.0.20210628" CACHE STRING "" FORCE) + SET(LAPACK_URL "https://paddlepaddledeps.bj.bcebos.com/${LAPACK_VER}.tar.gz" CACHE STRING "" FORCE) + SET(LAPACK_URL_MD5 71f8cc8237a8571692f3e07f9a4f25f6) + SET(GNU_RT_LIB_1 "${LAPACK_LIB_DIR}/libquadmath.so.0") + SET(GFORTRAN_LIB "${LAPACK_LIB_DIR}/libgfortran.so.3") + SET(BLAS_LIB "${LAPACK_LIB_DIR}/libblas.so.3") + SET(LAPACK_LIB "${LAPACK_LIB_DIR}/liblapack.so.3") +elseif(WIN32) + # Refer to [lapack-for-windows] http://icl.cs.utk.edu/lapack-for-windows/lapack/#lapacke + SET(LAPACK_VER "lapack_win_v3.10.0.20210628" CACHE STRING "" FORCE) + SET(LAPACK_URL "https://paddlepaddledeps.bj.bcebos.com/${LAPACK_VER}.zip" CACHE STRING "" FORCE) + SET(LAPACK_URL_MD5 590d080392dcd5abbd5dca767a50b63a) + SET(GNU_RT_LIB_1 "${LAPACK_LIB_DIR}/libquadmath-0.dll") + SET(GNU_RT_LIB_2 "${LAPACK_LIB_DIR}/libgcc_s_seh-1.dll") + SET(GFORTRAN_LIB "${LAPACK_LIB_DIR}/libgfortran-3.dll") + SET(BLAS_LIB "${LAPACK_LIB_DIR}/libblas.dll") + SET(LAPACK_LIB "${LAPACK_LIB_DIR}/liblapack.dll") +else() + SET(LAPACK_VER "lapack_mac_v3.10.0.20210628" CACHE STRING "" FORCE) + SET(LAPACK_URL "https://paddlepaddledeps.bj.bcebos.com/${LAPACK_VER}.tar.gz" CACHE STRING "" FORCE) + SET(LAPACK_URL_MD5 427aecf8dee8523de3566ca8e47944d7) + SET(GNU_RT_LIB_1 "${LAPACK_LIB_DIR}/libquadmath.0.dylib") + SET(GNU_RT_LIB_2 "${LAPACK_LIB_DIR}/libgcc_s.1.dylib") + SET(GFORTRAN_LIB "${LAPACK_LIB_DIR}/libgfortran.5.dylib") + SET(BLAS_LIB "${LAPACK_LIB_DIR}/libblas.3.dylib") + SET(LAPACK_LIB "${LAPACK_LIB_DIR}/liblapack.3.dylib") +endif() + +ExternalProject_Add( + extern_lapack + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${LAPACK_URL} + URL_MD5 ${LAPACK_URL_MD5} + PREFIX ${LAPACK_PREFIX_DIR} + DOWNLOAD_DIR ${LAPACK_SOURCE_DIR} + DOWNLOAD_NO_PROGRESS 1 + PATCH_COMMAND "" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ${LAPACK_SOURCE_DIR} ${LAPACK_LIB_DIR} + BUILD_BYPRODUCTS ${BLAS_LIB} + BUILD_BYPRODUCTS ${LAPACK_LIB} +) diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake index e213068377b14..e344ebaa2477e 100644 --- a/cmake/external/lite.cmake +++ b/cmake/external/lite.cmake @@ -35,6 +35,14 @@ if (LITE_WITH_XPU) ENDIF() endif() +if (LITE_WITH_NNADAPTER) + add_definitions(-DLITE_SUBGRAPH_WITH_NNADAPTER) + if (NNADAPTER_WITH_HUAWEI_ASCEND_NPU) + add_definitions(-DLITE_SUBGRAPH_WITH_NPU) + set(NPU_SDK_ROOT "/usr/local/Ascend/ascend-toolkit/latest" CACHE STRING "default NPU SDK ROOT") + endif() +endif() + if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) include(ExternalProject) set(LITE_PROJECT extern_lite) @@ -42,7 +50,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite) if(NOT LITE_GIT_TAG) - set(LITE_GIT_TAG d3a3a6931b6d22d504d21ba32b3ae972770e9204) + set(LITE_GIT_TAG 4ab64daecc11fbf74fffdc6a4733f388472e7d5d) endif() if(NOT CUDA_ARCH_NAME) @@ -67,6 +75,9 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) -DLITE_WITH_XPU=${LITE_WITH_XPU} -DXPU_SDK_URL=${XPU_BASE_URL} -DXPU_SDK_ENV=${XPU_SDK_ENV} + -DLITE_WITH_NNADAPTER=${LITE_WITH_NNADAPTER} + -DNNADAPTER_WITH_HUAWEI_ASCEND_NPU=${NNADAPTER_WITH_HUAWEI_ASCEND_NPU} + -DNNADAPTER_HUAWEI_ASCEND_NPU_SDK_ROOT=${NPU_SDK_ROOT} -DLITE_WITH_CODE_META_INFO=OFF -DLITE_WITH_ARM=ON) ExternalProject_Add( @@ -110,6 +121,9 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) -DLITE_WITH_XPU=${LITE_WITH_XPU} 
-DXPU_SDK_URL=${XPU_BASE_URL} -DXPU_SDK_ENV=${XPU_SDK_ENV} + -DLITE_WITH_NNADAPTER=${LITE_WITH_NNADAPTER} + -DNNADAPTER_WITH_HUAWEI_ASCEND_NPU=${NNADAPTER_WITH_HUAWEI_ASCEND_NPU} + -DNNADAPTER_HUAWEI_ASCEND_NPU_SDK_ROOT=${NPU_SDK_ROOT} -DLITE_WITH_CODE_META_INFO=OFF -DLITE_WITH_ARM=OFF) @@ -120,6 +134,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) GIT_TAG ${LITE_GIT_TAG} PREFIX ${LITE_SOURCES_DIR} UPDATE_COMMAND "" + PATCH_COMMAND sed -i "s?NNadapter_bridges_path = os.path.abspath('..')+\"\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?NNadapter_bridges_path = os.path.abspath(\'..\')+\"\/extern_lite\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?" ${LITE_SOURCES_DIR}/src/extern_lite//lite/tools/cmake_tools/record_supported_kernel_op.py && sed -i "/general::ssa::ConvertToSSA(cpp_prog)$/d" ${LITE_SOURCES_DIR}/src/extern_lite/lite/model_parser/model_parser.cc BUILD_COMMAND ${LITE_BUILD_COMMAND} INSTALL_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -146,6 +161,11 @@ endif() if (WITH_ARM) if(LITE_WITH_XPU) set(LITE_OUTPUT_BIN_DIR inference_lite_lib.armlinux.armv8.xpu) + elseif(LITE_WITH_NNADAPTER) + message("Enable LITE_WITH_NNADAPTER") + if (NNADAPTER_WITH_HUAWEI_ASCEND_NPU) + set(LITE_OUTPUT_BIN_DIR inference_lite_lib.armlinux.armv8.nnadapter) + endif() else() set(LITE_OUTPUT_BIN_DIR inference_lite_lib.armlinux.armv8) endif() @@ -174,5 +194,16 @@ endfunction() external_lite_libs(lite_full_static ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libpaddle_full_api_shared.so) set(LITE_SHARED_LIB ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libpaddle_full_api_shared.so) +if (LITE_WITH_NNADAPTER) + set(LITE_NNADAPTER_LIB ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libnnadapter.so) + if (NNADAPTER_WITH_HUAWEI_ASCEND_NPU) + external_lite_libs(lite_nnadapter ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libnnadapter.so ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libhuawei_ascend_npu.so) + set(LITE_DEPS lite_full_static lite_nnadapter) + set(LITE_NNADAPTER_NPU_LIB ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libhuawei_ascend_npu.so) + endif() +else() + set(LITE_DEPS lite_full_static) +endif() + add_definitions(-DPADDLE_WITH_LITE) add_definitions(-DLITE_WITH_LOG) diff --git a/cmake/external/pocketfft.cmake b/cmake/external/pocketfft.cmake new file mode 100644 index 0000000000000..7323f67d115e1 --- /dev/null +++ b/cmake/external/pocketfft.cmake @@ -0,0 +1,44 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +include(ExternalProject) + + +set(POCKETFFT_PATH "${THIRD_PARTY_PATH}/pocketfft" CACHE STRING "A path setting for external_pocketfft path.") +set(POCKETFFT_PREFIX_DIR ${POCKETFFT_PATH}) + +set(POCKETFFT_REPOSITORY https://gitlab.mpcdf.mpg.de/mtr/pocketfft.git) +set(POCKETFFT_TAG release_for_eigen) + +SET(POCKETFFT_INCLUDE_DIR ${POCKETFFT_PREFIX_DIR}/src) +message("POCKETFFT_INCLUDE_DIR is ${POCKETFFT_INCLUDE_DIR}") +include_directories(${POCKETFFT_INCLUDE_DIR}) + +ExternalProject_Add( + extern_pocketfft + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + GIT_REPOSITORY ${POCKETFFT_REPOSITORY} + GIT_TAG ${POCKETFFT_TAG} + PREFIX ${POCKETFFT_PREFIX_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +add_library(pocketfft INTERFACE) + +add_dependencies(pocketfft extern_pocketfft) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 02abf08a99ce8..70bdc67980c03 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -35,7 +35,7 @@ ELSE () ENDIF() SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") -SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210909") +SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210921") SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index aa31745c21340..44463f29923b2 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -210,9 +210,10 @@ include(external/threadpool)# download threadpool include(external/dlpack) # download dlpack include(external/xxhash) # download, build, install xxhash include(external/warpctc) # download, build, install warpctc +include(external/lapack) # download, build, install lapack list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash) -list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool) +list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool extern_lapack) include(cblas) # find first, then download, build, install openblas @@ -361,4 +362,10 @@ if (WITH_CRYPTO) add_definitions(-DPADDLE_WITH_CRYPTO) endif (WITH_CRYPTO) +if (WITH_POCKETFFT) + include(external/pocketfft) + list(APPEND third_party_deps extern_pocketfft) + add_definitions(-DPADDLE_WITH_POCKETFFT) +endif (WITH_POCKETFFT) + add_custom_target(third_party ALL DEPENDS ${third_party_deps}) diff --git a/log b/log new file mode 100644 index 0000000000000..c02e10686b5fb Binary files /dev/null and b/log differ diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 8708d90485af8..a42b686548c71 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -179,11 +179,9 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, if ((in_format != out_format) || always_copy) { void* in_data = GetDataFromTensor(in, in_type); - std::string key = - platform::CreateKey(*dev_ctx, in_tz, in_format, out_format, in_type); - platform::ReorderMKLDNNHandler handler(in_tz, in.type(), in_type, *dev_ctx, - cpu_engine, key); + platform::ReorderMKLDNNHandler handler(in_tz, in.type(), in_type, + cpu_engine); auto 
reorder_src_memory_p = handler.AcquireSrcMemory(in_format, in_data); auto reorder_dst_memory_p = diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index 72ee126e13ce0..08749b6b7515b 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include #include @@ -170,11 +171,26 @@ extern inline proto::VarType::Type ToComplexType(proto::VarType::Type t) { return proto::VarType::COMPLEX128; default: PADDLE_THROW(platform::errors::Unimplemented( - "Unknown complex value data type (%s), now only support float32 and " + "Unknown real value data type (%s), now only support float32 and " "float64.", DataTypeToString(t))); } } +extern inline proto::VarType::Type ToRealType(proto::VarType::Type t) { + switch (t) { + case proto::VarType::COMPLEX64: + return proto::VarType::FP32; + case proto::VarType::COMPLEX128: + return proto::VarType::FP64; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unknown complex value data type (%s), now only support complex64 " + "and " + "complex128.", + DataTypeToString(t))); + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc index fe7d243066237..8bac8b7df6d2d 100644 --- a/paddle/fluid/framework/ddim.cc +++ b/paddle/fluid/framework/ddim.cc @@ -107,6 +107,34 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) { return os; } +DDim flatten_to_3d(const DDim& src, int num_row_dims, int num_col_dims) { + PADDLE_ENFORCE_GE(src.size(), 3, + platform::errors::InvalidArgument( + "The rank of src dim should be at least 3 " + "in flatten_to_3d, but received %d.", + src.size())); + PADDLE_ENFORCE_EQ((num_row_dims >= 1 && num_row_dims < src.size()), true, + platform::errors::InvalidArgument( + "The num_row_dims should be inside [1, %d] " + "in flatten_to_3d, but received %d.", + src.size() - 1, num_row_dims)); + PADDLE_ENFORCE_EQ((num_col_dims >= 2 && num_col_dims <= src.size()), true, + platform::errors::InvalidArgument( + "The num_col_dims should be inside [2, %d] " + "in flatten_to_3d, but received %d.", + src.size(), num_col_dims)); + PADDLE_ENFORCE_GE( + num_col_dims, num_row_dims, + platform::errors::InvalidArgument( + "The num_row_dims should be less than num_col_dims in flatten_to_3d," + "but received num_row_dims = %d, num_col_dims = %d.", + num_row_dims, num_col_dims)); + + return DDim({product(slice_ddim(src, 0, num_row_dims)), + product(slice_ddim(src, num_row_dims, num_col_dims)), + product(slice_ddim(src, num_col_dims, src.size()))}); +} + DDim flatten_to_2d(const DDim& src, int num_col_dims) { return DDim({product(slice_ddim(src, 0, num_col_dims)), product(slice_ddim(src, num_col_dims, src.size()))}); diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h index e69fb4e761939..565e0b430dfdc 100644 --- a/paddle/fluid/framework/ddim.h +++ b/paddle/fluid/framework/ddim.h @@ -230,6 +230,13 @@ int arity(const DDim& ddim); std::ostream& operator<<(std::ostream&, const DDim&); +/** +* \brief Flatten dim to 3d +* e.g., DDim d = mak_ddim({1, 2, 3, 4, 5, 6}) +* flatten_to_3d(d, 2, 4); ===> {1*2, 3*4, 5*6} ===> {2, 12, 30} +*/ +DDim flatten_to_3d(const DDim& src, int num_row_dims, int num_col_dims); + // Reshape a tensor to a matrix. The matrix's first dimension(column length) // will be the product of tensor's first `num_col_dims` dimensions. 
DDim flatten_to_2d(const DDim& src, int num_col_dims); diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 1d78a650f905d..a485838a95942 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/platform/profiler.h" -DEFINE_bool( +PADDLE_DEFINE_EXPORTED_bool( cpu_deterministic, false, "Whether to make the result of computation deterministic in CPU side."); diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index dc5e24ef5de42..4346c144fab7f 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -1347,6 +1347,20 @@ void FleetWrapper::PrintTableStat(const uint64_t table_id) { #endif } +void FleetWrapper::SetFileNumOneShard(const uint64_t table_id, int file_num) { +#ifdef PADDLE_WITH_PSLIB + auto ret = + pslib_ptr_->_worker_ptr->set_file_num_one_shard(table_id, file_num); + ret.wait(); + int32_t err_code = ret.get(); + if (err_code == -1) { + LOG(ERROR) << "set_file_num_one_shard failed"; + } +#else + VLOG(0) << "FleetWrapper::SetFileNumOneShard does nothing when no pslib"; +#endif +} + double FleetWrapper::GetCacheThreshold(int table_id) { #ifdef PADDLE_WITH_PSLIB double cache_threshold = 0.0; diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index c1db06a298c86..d368b421ff2a0 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -266,6 +266,7 @@ class FleetWrapper { bool load_combine); void PrintTableStat(const uint64_t table_id); + void SetFileNumOneShard(const uint64_t table_id, int file_num); // mode = 0, load all feature // mode = 1, load delta feature, which means load diff void LoadModel(const std::string& path, const int mode); diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.h b/paddle/fluid/framework/fleet/gloo_wrapper.h index e69439892ca57..4eb40da1bfd39 100644 --- a/paddle/fluid/framework/fleet/gloo_wrapper.h +++ b/paddle/fluid/framework/fleet/gloo_wrapper.h @@ -215,7 +215,7 @@ class GlooWrapper { #else LOG(WARNING) << "AllGather does nothing when WITH_GLOO=OFF"; #endif - return std::move(ret); + return ret; } protected: diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 175bd59133412..99c691e6cf6f7 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -43,6 +43,7 @@ cc_library(graph SRCS graph.cc DEPS node pretty_log) cc_library(graph_helper SRCS graph_helper.cc DEPS graph) cc_library(pass SRCS pass.cc DEPS graph node graph_helper) cc_library(graph_traits SRCS graph_traits.cc DEPS graph) +cc_library(cost_model SRCS cost_model.cc DEPS executor graph profiler proto_desc device_tracer) SET(GRAPH_PATTERN_DETECTOR_DEPS graph graph_helper graph_traits) if (WITH_TESTING) @@ -141,6 +142,7 @@ cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) +cc_test(cost_model_test SRCS cost_model_test.cc DEPS cost_model op_registry) cc_test(test_graph_pattern_detector 
SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_op_compat_sensible_pass SRCS op_compat_sensible_pass_tester.cc DEPS op_compat_sensible_pass) cc_test(test_fc_fuse_pass_cc SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) diff --git a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc index ffd80f0c90a1c..08e7c6f5b8689 100644 --- a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc +++ b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc @@ -25,13 +25,14 @@ class VarDesc; } // namespace framework } // namespace paddle -DEFINE_double(fuse_parameter_memory_size, -1.0, // MBytes - "fuse_parameter_memory_size is up limited memory size(MB)" - "of one group parameters' gradient which is the input " - "of communication calling(e.g NCCLAllReduce). " - "The default value is 0, it means that " - "not set group according to memory_size."); -DEFINE_int32( +PADDLE_DEFINE_EXPORTED_double( + fuse_parameter_memory_size, -1.0, // MBytes + "fuse_parameter_memory_size is up limited memory size(MB)" + "of one group parameters' gradient which is the input " + "of communication calling(e.g NCCLAllReduce). " + "The default value is 0, it means that " + "not set group according to memory_size."); +PADDLE_DEFINE_EXPORTED_int32( fuse_parameter_groups_size, 1, "fuse_parameter_groups_size is the up limited size of one group " "parameters' gradient. " diff --git a/paddle/fluid/framework/ir/cost_model.cc b/paddle/fluid/framework/ir/cost_model.cc new file mode 100644 index 0000000000000..5027c50103a52 --- /dev/null +++ b/paddle/fluid/framework/ir/cost_model.cc @@ -0,0 +1,256 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/cost_model.h" + +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/errors.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { + +using ir::Graph; +using platform::Event; +using platform::MemEvent; + +const double CostData::NOT_MEASURED = -1; + +CostData::~CostData() { + // TODO(zhhsplendid): when we save a copy of program/graph, we should delete + // here. 
+} + +double CostData::GetOpTimeMs(int op_id) const { return op_time_ms_.at(op_id); } +double CostData::GetOpMemoryBytes(int op_id) const { + return op_memory_bytes_.at(op_id); +} +double CostData::GetWholeTimeMs() const { return whole_time_ms_; } +double CostData::GetWholeMemoryBytes() const { return whole_memory_bytes_; } + +const Graph* CostData::GetGraph() const { return graph_; } +const ProgramDesc* CostData::GetProgram() const { return program_; } + +bool CostData::SetCostData(const ProgramDesc& program, + const std::vector>& time_events) { + // TODO(zhhsplendid): Make a copy so that CostData can be available even if + // SWE changes Program, the copy can be saved into pointer program_ + if (program.Size() == 0) { + whole_time_ms_ = 0; + whole_memory_bytes_ = 0; + return true; + } + + if (time_events.empty()) { + LOG(WARNING) << "Input time_events for CostModel is empty"; + return false; + } + + std::vector main_thread_events = time_events[0]; + // Support global block only + // TODO(zhhsplendid): support sub blocks + const BlockDesc& global_block = program.Block(0); + size_t op_size = global_block.OpSize(); + if (op_size == 0) { + whole_time_ms_ = 0; + whole_memory_bytes_ = 0; + return true; + } + + bool event_to_cost_success = true; + size_t event_index = 0; + for (size_t i = 0; i < op_size; ++i) { + const OpDesc* op_desc = global_block.Op(i); + std::string op_type = op_desc->Type(); + + while (event_index < main_thread_events.size()) { + if (main_thread_events[event_index].name() == op_type && + main_thread_events[event_index].type() == + platform::EventType::kPushRange) { + break; + } + ++event_index; + } + if (event_index >= main_thread_events.size()) { + LOG(WARNING) << "Input time_events for Op " << i << ", type '" << op_type + << "' have wrong format, skip this Op."; + event_to_cost_success = false; + continue; + } + size_t op_push_index = event_index; + + while (event_index < main_thread_events.size()) { + // Is it possible to Push a lot of Ops with same type and then Pop? 
+ // ControlFlow Op can be like that, but this version only support global + // block + // TODO(zhhsplendid): make a more strict mapping between push and pop + if (main_thread_events[event_index].name() == op_type && + main_thread_events[event_index].type() == + platform::EventType::kPopRange) { + break; + } + ++event_index; + } + if (event_index >= main_thread_events.size()) { + LOG(WARNING) << "Input time_events for Op " << i << ", type '" << op_type + << "' have wrong format, skip this Op."; + event_to_cost_success = false; + continue; + } + size_t op_pop_index = event_index; + double cpu_time_ms = main_thread_events[op_push_index].CpuElapsedMs( + main_thread_events[op_pop_index]); + double gpu_time_ms = 0; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + gpu_time_ms = main_thread_events[op_push_index].CudaElapsedMs( + main_thread_events[op_pop_index]); +#endif + double time_ms = gpu_time_ms + cpu_time_ms; + op_time_ms_[i] = time_ms; + } + + event_index = 0; + int start_profiler_idx = -1; + int stop_profiler_idx = -1; + while (event_index < main_thread_events.size()) { + if (main_thread_events[event_index].name() == "_start_profiler_") { + start_profiler_idx = event_index; + } else if (main_thread_events[event_index].name() == "_stop_profiler_") { + stop_profiler_idx = event_index; + break; + } + ++event_index; + } + if (start_profiler_idx != -1 && stop_profiler_idx != -1) { + double cpu_time_ms = main_thread_events[start_profiler_idx].CpuElapsedMs( + main_thread_events[stop_profiler_idx]); + double gpu_time_ms = 0; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + gpu_time_ms = main_thread_events[start_profiler_idx].CudaElapsedMs( + main_thread_events[stop_profiler_idx]); +#endif + whole_time_ms_ = gpu_time_ms + cpu_time_ms; + } else { + LOG(WARNING) << "Input time_events for whole time have wrong format"; + event_to_cost_success = false; + } + + return event_to_cost_success; +} + +void PrintEvents(const std::vector>* time_events, + const std::vector>* mem_events) { + if (time_events != nullptr) { + for (size_t i = 0; i < time_events->size(); ++i) { + for (size_t j = 0; j < (*time_events)[i].size(); ++j) { + VLOG(4) << "Print time event (" << i << ", " << j << ")" << std::endl; + VLOG(4) << (*time_events)[i][j].name() << " " + << (*time_events)[i][j].attr() << std::endl; + VLOG(4) << "This: " << &(*time_events)[i][j] + << ", Parent: " << (*time_events)[i][j].parent() << std::endl; + if ((*time_events)[i][j].role() == platform::EventRole::kInnerOp) { + VLOG(4) << "role kInnerOp" << std::endl; + } else if ((*time_events)[i][j].role() == + platform::EventRole::kUniqueOp) { + VLOG(4) << "role kUniqueOp" << std::endl; + } else if ((*time_events)[i][j].role() == + platform::EventRole::kOrdinary) { + VLOG(4) << "role kOrdinary" << std::endl; + } else if ((*time_events)[i][j].role() == + platform::EventRole::kSpecial) { + VLOG(4) << "role kSpecial" << std::endl; + } + + if ((*time_events)[i][j].type() == platform::EventType::kPopRange) { + VLOG(4) << "type kPopRange" << std::endl; + } else if ((*time_events)[i][j].type() == + platform::EventType::kPushRange) { + VLOG(4) << "type kPushRange" << std::endl; + } else if ((*time_events)[i][j].type() == platform::EventType::kMark) { + VLOG(4) << "type kMark" << std::endl; + } + VLOG(4) << std::endl; + } + } + } + if (mem_events != nullptr) { + for (size_t i = 0; i < mem_events->size(); ++i) { + for (size_t j = 0; j < (*mem_events)[i].size(); ++j) { + VLOG(4) << "Print mem event (" << i << ", " << j << ")" << std::endl; + 
VLOG(4) << (*mem_events)[i][j].annotation() << std::endl; + } + } + } +} + +std::string ToLowerCopy(const std::string& in) { + std::string out(in); + std::transform(out.begin(), out.end(), out.begin(), + [](unsigned char c) { return std::tolower(c); }); + return out; +} + +CostData CostModel::ProfileMeasure( + const ProgramDesc& main_program, const ProgramDesc& startup_program, + const std::string& device, + const std::vector& fetch_cost_list) const { + // Currently fetch_cost_list is useless + // TODO(zhhsplendid): support different fetch data + + platform::ProfilerState profiler_state; + platform::Place place; + + std::string device_lower_case = ToLowerCopy(device); + if (device_lower_case == "cpu") { + profiler_state = platform::ProfilerState::kCPU; + place = platform::CPUPlace(); + } else if (device_lower_case == "gpu") { + profiler_state = platform::ProfilerState::kAll; + place = platform::CUDAPlace(); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Not support %s in CostModel now", device)); + } + + Executor executor(place); + Scope scope; + executor.Run(startup_program, &scope, /*block_id = */ 0); + + // TODO(zhhsplendid): handle the case that Profiler is already enabled + SetTracerOption(platform::TracerOption::kAllOpDetail); + EnableProfiler(profiler_state); + executor.Run(main_program, &scope, /*block_id = */ 0); + + std::unique_ptr>> time_events( + new std::vector>()); + std::unique_ptr>> mem_events( + new std::vector>()); + + CompleteProfilerEvents(/*tracer_profile= */ nullptr, time_events.get(), + mem_events.get()); + + // TODO(zhhsplendid): remove debug vlog after this series of work + PrintEvents(time_events.get(), mem_events.get()); + + // Convert events to cost data + CostData cost_data; + cost_data.SetCostData(main_program, *time_events); + + return cost_data; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/cost_model.h b/paddle/fluid/framework/ir/cost_model.h new file mode 100644 index 0000000000000..41567df2cb332 --- /dev/null +++ b/paddle/fluid/framework/ir/cost_model.h @@ -0,0 +1,85 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace framework { + +class CostData { + public: + CostData() {} + + ~CostData(); + + // Support global block only + // TODO(zhhsplendid): add support for sub-block + double GetOpTimeMs(int op_id) const; + double GetOpMemoryBytes(int op_id) const; + double GetWholeTimeMs() const; + double GetWholeMemoryBytes() const; + + const ir::Graph* GetGraph() const; + const ProgramDesc* GetProgram() const; + + // Support Time Event only + // TODO(zhhsplendid): add memory + bool SetCostData( + const ProgramDesc& program, + const std::vector>& time_events); + + static const double NOT_MEASURED; + + private: + ir::Graph* graph_{nullptr}; + ProgramDesc* program_{nullptr}; + std::map op_time_ms_; // from Op Node id to time + std::map + op_memory_bytes_; // from Op Node id to total memory bytes + std::map comm_; // from Op Node id to communicate cost + double whole_time_ms_{ + NOT_MEASURED}; // time cost of the whole program or graph + double whole_memory_bytes_{ + NOT_MEASURED}; // memory cost of the whole program or graph + double whole_comm_{ + NOT_MEASURED}; // communication cost of the whole program or graph +}; + +class CostModel { + public: + CostModel() {} + ~CostModel() {} + + CostData ProfileMeasure( + const ProgramDesc& main_program, const ProgramDesc& startup_program, + const std::string& device, + const std::vector& fetch_cost_list) const; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/cost_model_test.cc b/paddle/fluid/framework/ir/cost_model_test.cc new file mode 100644 index 0000000000000..57f3904d845c8 --- /dev/null +++ b/paddle/fluid/framework/ir/cost_model_test.cc @@ -0,0 +1,209 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/cost_model.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/platform/errors.h" +#include "paddle/fluid/platform/event.h" + +namespace paddle { +namespace framework { + +// Register test op +class FakeTestOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddInput("Y", "").AsDuplicable(); + AddOutput("Out", "").AsDuplicable(); + AddComment(""); + } +}; + +class FakeTestOp : public OperatorBase { + public: + FakeTestOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const Scope &scope, + const platform::Place &place) const override { + // Fake RunImpl, for test only + Variable *var = scope.FindVar("X"); + if (var != nullptr) { + LoDTensor *tensor = var->GetMutable(); + tensor->mutable_data(place); + } + int count = 0; + while (count <= 1000) { + ++count; + } + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OPERATOR(fake_test_op, paddle::framework::FakeTestOp, + paddle::framework::FakeTestOpMaker); + +namespace paddle { +namespace framework { + +ProgramDesc CreateTestProgram() { + // create a ProgramDesc: + // Z = fake_test_op(X, Y) + // Out = fake_test_op(Z, W) + ProgramDesc program; + auto *global_block = program.MutableBlock(0); + + auto *x = global_block->Var("X"); + x->SetType(proto::VarType::LOD_TENSOR); + x->SetLoDLevel(0); + x->SetDataType(proto::VarType::FP32); + x->SetShape({1000, 784}); + + auto *y = global_block->Var("Y"); + y->SetType(proto::VarType::LOD_TENSOR); + y->SetLoDLevel(0); + y->SetDataType(proto::VarType::FP32); + y->SetShape({784, 100}); + + auto *op0 = global_block->AppendOp(); + op0->SetType("fake_test_op"); + op0->SetInput("X", {x->Name()}); + op0->SetInput("Y", {y->Name()}); + + auto *z = global_block->Var("Z"); + z->SetType(proto::VarType::LOD_TENSOR); + op0->SetOutput("Out", {z->Name()}); + + auto *w = global_block->Var("W"); + w->SetType(proto::VarType::LOD_TENSOR); + w->SetLoDLevel(0); + w->SetDataType(proto::VarType::FP32); + w->SetShape({100, 10}); + + auto *op1 = global_block->AppendOp(); + op1->SetType("fake_test_op"); + op1->SetInput("X", {z->Name()}); + op1->SetInput("Y", {w->Name()}); + + auto *out = global_block->Var("Out"); + out->SetType(proto::VarType::LOD_TENSOR); + op1->SetOutput("Out", {out->Name()}); + return program; +} + +TEST(CostModelTest, TestProfileMeasure_EmptyProgram) { + CostModel cost_model; + ProgramDesc empty_program; + CostData cost_data = + cost_model.ProfileMeasure(empty_program, empty_program, "cpu", {"time"}); + EXPECT_EQ(cost_data.GetWholeTimeMs(), 0); +} + +TEST(CostModelTest, TestProfileMeasure_Program) { + CostModel cost_model; + ProgramDesc program = CreateTestProgram(); + ProgramDesc empty_program; + CostData cost_data = + cost_model.ProfileMeasure(program, empty_program, "cpu", {"time"}); + double op0_time_ms = cost_data.GetOpTimeMs(0); + double op1_time_ms = cost_data.GetOpTimeMs(1); + EXPECT_GT(op0_time_ms, 0); + EXPECT_GT(op1_time_ms, 0); + EXPECT_GT(cost_data.GetWholeTimeMs(), op0_time_ms + op1_time_ms); +} + +TEST(CostModelTest, TestProfileMeasure_UnsupportedDevice) { + CostModel cost_model; + ProgramDesc program = CreateTestProgram(); + ProgramDesc empty_program; + + 
EXPECT_THROW(cost_model.ProfileMeasure(program, empty_program, "wrong_device", + {"time"}), + paddle::platform::EnforceNotMet); +} + +TEST(CostDataTest, TestGetGraphProgram) { + CostData cost_data; + EXPECT_EQ(cost_data.GetGraph(), nullptr); + EXPECT_EQ(cost_data.GetProgram(), nullptr); +} + +TEST(CostDataTest, TestUninitailzed) { + CostData cost_data; + EXPECT_EQ(cost_data.GetWholeMemoryBytes(), CostData::NOT_MEASURED); + EXPECT_EQ(cost_data.GetWholeTimeMs(), CostData::NOT_MEASURED); +} + +TEST(CostDataTest, TestEmptyProgram) { + CostData cost_data; + ProgramDesc empty_program(""); + EXPECT_EQ(cost_data.SetCostData(empty_program, {}), true); + EXPECT_EQ(cost_data.GetWholeMemoryBytes(), 0); + EXPECT_EQ(cost_data.GetWholeTimeMs(), 0); +} + +TEST(CostDataTest, TestEmptyTimeEvent) { + CostData cost_data; + ProgramDesc program = CreateTestProgram(); + EXPECT_EQ(cost_data.SetCostData(program, {}), false); + EXPECT_EQ(cost_data.GetWholeMemoryBytes(), CostData::NOT_MEASURED); + EXPECT_EQ(cost_data.GetWholeTimeMs(), CostData::NOT_MEASURED); +} + +TEST(CostDataTest, TestNoOpEvent) { + CostData cost_data; + ProgramDesc program = CreateTestProgram(); + std::vector thread_events; + thread_events.push_back( + platform::Event(platform::EventType::kPushRange, "not exist name", 0)); + std::vector> time_events{thread_events}; + EXPECT_EQ(cost_data.SetCostData(program, time_events), false); +} + +TEST(CostDataTest, TestNoOpPopEvent) { + CostData cost_data; + ProgramDesc program = CreateTestProgram(); + std::vector thread_events; + thread_events.push_back( + platform::Event(platform::EventType::kPushRange, "fake_test_op", 0)); + std::vector> time_events{thread_events}; + EXPECT_EQ(cost_data.SetCostData(program, time_events), false); +} + +TEST(CostDataTest, TestNoWholeEvent) { + CostData cost_data; + ProgramDesc program = CreateTestProgram(); + std::vector thread_events; + thread_events.push_back( + platform::Event(platform::EventType::kPushRange, "fake_test_op", 0)); + thread_events.push_back( + platform::Event(platform::EventType::kPopRange, "fake_test_op", 0)); + thread_events.push_back( + platform::Event(platform::EventType::kPushRange, "fake_test_op", 0)); + thread_events.push_back( + platform::Event(platform::EventType::kPopRange, "fake_test_op", 0)); + std::vector> time_events{thread_events}; + EXPECT_EQ(cost_data.SetCostData(program, time_events), false); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc index 4ce91999207a2..b9cc337df8792 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc @@ -92,7 +92,6 @@ void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const { int range = ((1 << (bit_length - 1)) - 1); std::vector weight_scale; std::string quant_dequant_op_out_name = quant_dequant_op_out->Var()->Name(); - auto* any_op2_desc = any_op2->Op(); auto var_map = any_op2_desc->Inputs(); std::string arg_name = ""; @@ -106,43 +105,52 @@ void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_GT(arg_name.size(), 0, platform::errors::InvalidArgument( "can not find the input %s.", quant_dequant_op_out_name)); - any_op2_desc->SetAttr("enable_int8", true); + // any_op2_desc->SetAttr("enable_int8", true); any_op2_desc->SetAttr("bit_length", bit_length); + // modify the any_op2's inputs - any_op2_desc->Flush(); auto 
dequant_type = quant_dequant_op->Op()->Type(); - auto quantized_op_type = any_op2_desc->Type(); + // get weight tensor auto* weight_tensor = scope->GetVar(quant_dequant_op_x->Name())->GetMutable(); auto w_dims = weight_tensor->dims(); + float* quantized_weight_data = weight_tensor->mutable_data(platform::CPUPlace()); // Get weight scale if (dequant_type == "fake_channel_wise_quantize_dequantize_abs_max") { - auto scales_name = quant_dequant_op->Op()->Output("OutScale"); + int quant_axis = + BOOST_GET_CONST(int, quant_dequant_op->Op()->GetAttr("quant_axis")); + PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, true, + platform::errors::InvalidArgument( + "'quant_axis' should be 0 or 1, but " + "the received is %d", + quant_axis)); + + // To Do @Wangzheee: use "OutScale" to quantdequant + /*auto scales_name = quant_dequant_op->Op()->Output("OutScale"); PADDLE_ENFORCE_EQ(scales_name.size(), 1, platform::errors::InvalidArgument( "Scales size in channel-wise quant dequantize op " "should be 1, got %d.", scales_name.size())); const LoDTensor& channel_scale_tensor = - scope->GetVar(scales_name[0])->Get(); + scope->FindVar(scales_name[0])->Get(); PADDLE_ENFORCE( paddle::platform::is_cpu_place(channel_scale_tensor.place()), platform::errors::InvalidArgument( "Channel scale tensor's place should be CPU.")); // compute the channel wise abs max of the weight tensor - int quant_axis = - BOOST_GET_CONST(int, quant_dequant_op->Op()->GetAttr("quant_axis")); - PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, true, - platform::errors::InvalidArgument( - "'quant_axis' should be 0 or 1, but " - "the received is %d", - quant_axis)); + const float* channel_scale_data = channel_scale_tensor.data(); + for (int i = 0; i < channel_scale_tensor.numel(); i++) { + weight_scale.push_back(channel_scale_data[i] ); + }*/ + // Implement channel_wise_quantize_dequantize_abs_max quantization + // algorithm const int64_t channel = w_dims[quant_axis]; weight_scale.resize(channel, 0); if (quant_axis == 0) { @@ -171,11 +179,10 @@ void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NE(weight_scale[i], 0, platform::errors::InvalidArgument( "Weight scale should be nonzero, but get zero.")); - weight_scale[i] = range / weight_scale[i]; + weight_scale[i] = weight_scale[i] / range; } } else { - auto scale_name = quant_dequant_op_outscale->Name(); - // compute the abs max of the weight tensor + // Implement quantize_dequantize_abs_max quantization algorithm float abs_max_weight = 0.; for (int j = 0; j < weight_tensor->numel(); j++) { abs_max_weight = @@ -184,113 +191,10 @@ void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NE(abs_max_weight, 0, platform::errors::InvalidArgument( "Weight scale should be nonzero, but get zero")); - weight_scale.push_back((range * range) / abs_max_weight / range); + weight_scale.push_back(abs_max_weight / range); } nodes2rm.insert(quant_dequant_op_outscale); - - // perform quantize dequantize operations - // If quantized op is not channel wise, weight scale size = 1; - // If quantized op is conv2d, weight scale size = weight dims[0] - // If quantized op is conv2d_transpose, weight scale size = weight dims[1] - if (dequant_type == "fake_quantize_dequantize_abs_max") { - PADDLE_ENFORCE_EQ( - weight_scale.size(), 1, - platform::errors::InvalidArgument( - "%s op weight dequantized by [fake_quantize_dequantize_max_abs] " - "requires weight scale size = 1, but got %d.", - quantized_op_type, weight_scale.size())); - for (int j = 0; 
j < weight_tensor->numel(); j++) { - // quantized - quantized_weight_data[j] = quantized_weight_data[j] * weight_scale[0]; - quantized_weight_data[j] = std::round(quantized_weight_data[j]); - // dequantized - quantized_weight_data[j] /= weight_scale[0]; - } - } else if (quantized_op_type == "mul" || quantized_op_type == "matmul" || - quantized_op_type == "fc") { - if (dequant_type == "fake_channel_wise_quantize_dequantize_abs_max") { - PADDLE_ENFORCE_EQ( - weight_scale.size(), static_cast(w_dims[1]), - platform::errors::InvalidArgument( - "mul op weight dequantized by " - "[fake_channel_wise_quantize_dequantize_abs_max] requires " - "weight scale " - "size = 2nd dim of mul's weight, which is %zu, but got %zu.", - static_cast(w_dims[1]), weight_scale.size())); - for (int j = 0; j < weight_tensor->numel(); j++) { - // quantized - PADDLE_ENFORCE_NE( - weight_scale[j % w_dims[1]], 0, - platform::errors::InvalidArgument( - "fc op weight scale should be nonzero, but get zero")); - quantized_weight_data[j] = - quantized_weight_data[j] * weight_scale[j % w_dims[1]]; - quantized_weight_data[j] = std::round(quantized_weight_data[j]); - // dequantized - quantized_weight_data[j] /= weight_scale[j % w_dims[1]]; - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported quantized op type: %s", quantized_op_type)); - } - } else if (quantized_op_type == "conv2d" || - quantized_op_type == "depthwise_conv2d") { - if (dequant_type == "fake_channel_wise_quantize_dequantize_abs_max") { - PADDLE_ENFORCE_EQ( - weight_scale.size(), static_cast(w_dims[0]), - platform::errors::InvalidArgument( - "conv2d op requires weight scale size = channel size of the " - "weight, which is %zu, but got %zu.", - static_cast(w_dims[0]), weight_scale.size())); - int inner_size = w_dims[1] * w_dims[2] * w_dims[3]; - for (int j = 0; j < weight_tensor->numel(); j++) { - // quantized - PADDLE_ENFORCE_NE( - weight_scale[j / inner_size], 0, - platform::errors::InvalidArgument( - "conv2d op weight scale should be nonzero, but get zero")); - quantized_weight_data[j] = - quantized_weight_data[j] * weight_scale[j / inner_size]; - quantized_weight_data[j] = std::round(quantized_weight_data[j]); - // dequantized - quantized_weight_data[j] /= weight_scale[j / inner_size]; - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported quantized op type: %s", quantized_op_type)); - } - } else if (quantized_op_type == "conv2d_transpose") { - if (dequant_type == "fake_channel_wise_quantize_dequantize_abs_max") { - PADDLE_ENFORCE_EQ( - weight_scale.size(), static_cast(w_dims[0]), - platform::errors::InvalidArgument( - "conv2d_transpose op requires weight scale size = channel size " - "of the " - "weight, which is %zu, but got %zu.", - static_cast(w_dims[1]), weight_scale.size())); - int inner_size = w_dims[2] * w_dims[3]; - for (int j = 0; j < weight_tensor->numel(); j++) { - // quantized - PADDLE_ENFORCE_NE(weight_scale[(j / inner_size) % w_dims[1]], 0, - platform::errors::InvalidArgument( - "conv2d_transpose op weight scale should be " - "nonzero, but get zero")); - quantized_weight_data[j] = quantized_weight_data[j] * - weight_scale[(j / inner_size) % w_dims[1]]; - quantized_weight_data[j] = std::round(quantized_weight_data[j]); - // dequantized - quantized_weight_data[j] /= - weight_scale[(j / inner_size) % w_dims[1]]; - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported quantized op type: %s", quantized_op_type)); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - 
"Unsupported quantized op type: %s", quantized_op_type)); - } nodes2rm.insert(quant_dequant_op_out); // link weight in quant_dequant_op_x to any_op2 diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc index 65e8b8fc80d10..b99f2266f39b2 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc @@ -28,76 +28,85 @@ namespace ir { #define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); #define GET_NODES \ - GET_IR_NODE(any_op_out); \ GET_IR_NODE(quant_dequant_op_inscale); \ GET_IR_NODE(quant_dequant_op); \ GET_IR_NODE(quant_dequant_op_outscale); \ - GET_IR_NODE(quant_dequant_op_out); \ - GET_IR_NODE(any_op2); + GET_IR_NODE(quant_dequant_op_out); void DeleteQuantDequantOpPass::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "delete_quantdequant_op_pattern"; FusePassBase::Init(pattern_name, graph); - GraphPatternDetector gpd; + std::string quantdequant_types = + "fake_quantize_dequantize_moving_average_abs_max"; + + auto* input_node = gpd.mutable_pattern() + ->NewNode("input_node") + ->assert_is_op_input(quantdequant_types, "X") + ->AsInput(); + patterns::DeleteQuantDequantOpPattern pattern(gpd.mutable_pattern(), pattern_name); - pattern(); + pattern(input_node, quantdequant_types); auto* scope = param_scope(); + int found_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + PADDLE_ENFORCE_EQ( + subgraph.count(input_node), true, + platform::errors::NotFound( + "Input act node(%s) not found in QuantDequantFuse pass.", + input_node->name())); + Node* input = subgraph.at(input_node); GET_NODES; - IR_NODE_LINK_TO(any_op_out, any_op2); - std::string any_op_out_name = any_op_out->Var()->Name(); - std::string quant_dequant_op_out_name = quant_dequant_op_out->Var()->Name(); + int bit_length = + BOOST_GET_CONST(int, quant_dequant_op->Op()->GetAttr("bit_length")); + int range = ((1 << (bit_length - 1)) - 1); + // Get input scale from tensor std::string input_scale_var_name = quant_dequant_op->Op()->Input("InScale").front(); + PADDLE_ENFORCE_NOT_NULL( + scope, platform::errors::InvalidArgument( + "Scope in DeleteQuantDequantOpPass should not be null.")); const LoDTensor& input_scale_tensor = - scope->GetVar(input_scale_var_name)->Get(); - + scope->FindVar(input_scale_var_name)->Get(); + PADDLE_ENFORCE_EQ( + paddle::platform::is_cpu_place(input_scale_tensor.place()), true, + platform::errors::InvalidArgument( + "Input scale tensor's place should be CPU.")); const float* input_scale_data = input_scale_tensor.data(); - float input_scale = input_scale_data[0] / 127.; - auto* any_op2_desc = any_op2->Op(); - // auto input_args_names = any_op2_desc->InputArgumentNames(); - auto var_map = any_op2_desc->Inputs(); - std::string arg_name = ""; - for (auto& name_m : var_map) { - if (std::find(name_m.second.begin(), name_m.second.end(), - quant_dequant_op_out_name) != name_m.second.end()) { - arg_name = name_m.first; - } - } - CHECK(arg_name.size() > 0) << "can not find the input " - << quant_dequant_op_out_name; - any_op2_desc->SetAttr("enable_int8", true); - any_op2_desc->SetAttr(arg_name + "_scale", input_scale); + float input_scale = input_scale_data[0] / range; - // modify the any_op2's inputs - for (auto& name_m : var_map) { - if (std::find(name_m.second.begin(), name_m.second.end(), - quant_dequant_op_out_name) != name_m.second.end()) { - std::vector new_inputs; - for (auto& 
i_n : name_m.second) { - if (i_n != quant_dequant_op_out_name) { - new_inputs.push_back(i_n); - } - } - new_inputs.push_back(any_op_out_name); - any_op2_desc->SetInput(name_m.first, new_inputs); - any_op2_desc->Flush(); + // Set input scale in attr, and relink nodes + std::string input_name = input->Var()->Name(); + std::string quant_dequant_output_name = quant_dequant_op_out->Var()->Name(); + auto outlinks = quant_dequant_op_out->outputs; + for (auto* quantized_node : outlinks) { + auto op_desc = quantized_node->Op(); + std::string quantized_op_type = op_desc->Type(); + if (quantized_op_type == "mul" || quantized_op_type == "matmul" || + quantized_op_type == "matmul_v2") { + op_desc->SetAttr("X_scale", input_scale); + } else { + op_desc->SetAttr("Input_scale", input_scale); } + op_desc->SetAttr("bit_length", bit_length); + op_desc->RenameInput(quant_dequant_output_name, input_name); + op_desc->Flush(); + IR_NODE_LINK_TO(input, quantized_node); } - any_op2_desc->Flush(); + // Delete the unneeded nodes. GraphSafeRemoveNodes(graph, - {quant_dequant_op, quant_dequant_op_out, - quant_dequant_op_inscale, quant_dequant_op_outscale}); + {quant_dequant_op_inscale, quant_dequant_op, + quant_dequant_op_outscale, quant_dequant_op_out}); + found_count++; }; - gpd(graph, handler); + AddStatis(found_count); } } // namespace ir diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 0bb2782b3737e..4510aea925e78 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -135,7 +135,7 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const { } // Create an FC Node. - OpDesc desc; + OpDesc desc(mul->Op()->Block()); desc.SetType("fc"); // Set inputs of fc diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h index a862755d604e4..df3fbc293b78e 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h @@ -88,7 +88,7 @@ std::unique_ptr PrepareGraph( nullptr, false, false, activation, gate_activation); std::unique_ptr graph(new ir::Graph(layers.main_program())); - return std::move(graph); + return graph; } } // namespace fc_gru_test } // namespace ir diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h index f681a2b7ff8eb..a313e49f0b2b6 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h @@ -91,7 +91,7 @@ std::unique_ptr PrepareGraph( false, gate_activation, cell_activation, candidate_activation); std::unique_ptr graph(new ir::Graph(layers.main_program())); - return std::move(graph); + return graph; } } // namespace fc_lstm_test diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index a174aa88d937b..036fde8fac6d9 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -17,8 +17,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/operator.h" -DEFINE_bool(convert_all_blocks, true, - "Convert all blocks in program into SSAgraphs"); +PADDLE_DEFINE_EXPORTED_bool(convert_all_blocks, true, + "Convert all blocks in program into SSAgraphs"); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index a73bc487c92cc..5f7bfc61b4229 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -18,9 +18,9 @@ limitations under the License. */ #include "paddle/fluid/framework/op_proto_maker.h" DECLARE_bool(convert_all_blocks); -DEFINE_string(print_sub_graph_dir, "", - "FLAGS_print_sub_graph_dir is used " - "to print the nodes of sub_graphs."); +PADDLE_DEFINE_EXPORTED_string(print_sub_graph_dir, "", + "FLAGS_print_sub_graph_dir is used " + "to print the nodes of sub_graphs."); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index e1b77a59911fb..4150d0ca555c9 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2547,39 +2547,28 @@ void patterns::ShuffleChannelPattern::operator()(PDNode *reshape1_in) { reshape2_out->LinksFrom({reshape2_op}); } -void patterns::DeleteQuantDequantOpPattern::operator()() { - auto any_op_out = - pattern->NewNode(any_op_out_repr()) - ->assert_is_op_input( - "fake_quantize_dequantize_moving_average_abs_max", "X") - ->AsInput(); - +void patterns::DeleteQuantDequantOpPattern::operator()( + PDNode *input_node, const std::string &quantdequant_types) { auto quant_dequant_op_inscale = pattern->NewNode(quant_dequant_op_inscale_repr()) - ->assert_is_op_input( - "fake_quantize_dequantize_moving_average_abs_max", "InScale") + ->assert_is_op_input(quantdequant_types, "InScale") ->AsInput(); - auto quant_dequant_op = - pattern->NewNode(quant_dequant_op_repr()) - ->assert_is_op("fake_quantize_dequantize_moving_average_abs_max"); + auto quant_dequant_op = pattern->NewNode(quant_dequant_op_repr()) + ->assert_is_op(quantdequant_types); - auto quant_dequant_out = + auto quant_dequant_op_out = pattern->NewNode(quant_dequant_op_out_repr()) - ->assert_is_op_output( - "fake_quantize_dequantize_moving_average_abs_max", "Out") - ->AsIntermediate(); + ->assert_is_op_output(quantdequant_types, "Out") + ->AsOutput(); auto quant_dequant_op_outscale = pattern->NewNode(quant_dequant_op_outscale_repr()) - ->assert_is_op_output( - "fake_quantize_dequantize_moving_average_abs_max", "OutScale") + ->assert_is_op_output(quantdequant_types, "OutScale") ->AsOutput(); - auto any_op2 = pattern->NewNode(any_op2_repr())->assert_is_op()->AsOutput(); - quant_dequant_op->LinksFrom({any_op_out, quant_dequant_op_inscale}); + quant_dequant_op->LinksFrom({quant_dequant_op_inscale, input_node}); quant_dequant_op_outscale->LinksFrom({quant_dequant_op}); - quant_dequant_out->LinksFrom({quant_dequant_op}); - any_op2->LinksFrom({quant_dequant_out}); + quant_dequant_op_out->LinksFrom({quant_dequant_op}); } void patterns::DeleteQuantDequantFilterOpPattern::operator()() { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 3cfaa4661ae68..40c3e4f59bf26 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1481,14 +1481,12 @@ struct 
DeleteQuantDequantOpPattern : public PatternBase { DeleteQuantDequantOpPattern(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "delete_quantdequant_op_pattern") {} - void operator()(); + void operator()(PDNode* input_node, const std::string& quantdequant_types); - PATTERN_DECL_NODE(any_op_out); PATTERN_DECL_NODE(quant_dequant_op_inscale); PATTERN_DECL_NODE(quant_dequant_op); PATTERN_DECL_NODE(quant_dequant_op_outscale); PATTERN_DECL_NODE(quant_dequant_op_out); - PATTERN_DECL_NODE(any_op2); }; struct DeleteQuantDequantFilterOpPattern : public PatternBase { diff --git a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc index b8666c1c73ee0..864055cfa3620 100644 --- a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc +++ b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc @@ -220,7 +220,7 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { LOG(WARNING) << "Pass in op compat failed."; return; } - OpDesc desc; + OpDesc desc(matmul_op->Op()->Block()); desc.SetType("mul"); desc.SetInput("X", {matmul_in_x->Name()}); desc.SetInput("Y", {matmul_in_y->Name()}); @@ -299,7 +299,7 @@ void Squeeze2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { LOG(WARNING) << "Pass in op compat failed."; return; } - OpDesc desc; + OpDesc desc(matmul_op->Op()->Block()); desc.SetType("mul"); desc.SetInput("X", {squeeze2_in_x->Name()}); desc.SetInput("Y", {matmul_in_y->Name()}); @@ -441,7 +441,7 @@ void Reshape2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { LOG(WARNING) << "Pass in op compat failed."; return; } - OpDesc desc; + OpDesc desc(matmul_op->Op()->Block()); desc.SetType("mul"); desc.SetInput("X", {reshape2_in_x->Name()}); desc.SetInput("Y", {matmul_in_y->Name()}); @@ -526,7 +526,7 @@ void Flatten2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { LOG(WARNING) << "Pass in op compat failed."; return; } - OpDesc desc; + OpDesc desc(matmul_op->Op()->Block()); desc.SetType("mul"); desc.SetInput("X", {flatten2_in_x->Name()}); desc.SetInput("Y", {matmul_in_y->Name()}); diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index a8147fd466b52..c826e1c5a584a 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -62,7 +62,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) { // BOOST_GET_CONST(bool, scale->Op()->GetAttr("bias_after_scale")); // create multihead - OpDesc multihead_op_desc; + OpDesc multihead_op_desc(mul0->Op()->Block()); // create tmp tensor VarDesc k_var_desc(*mul1_out->Var()); @@ -847,7 +847,7 @@ int MultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph, int head_number = BOOST_GET_CONST(std::vector, reshape_desc->GetAttr("shape")).at(2); - OpDesc multihead_op_desc; + OpDesc multihead_op_desc(mul0->Op()->Block()); multihead_op_desc.SetType("multihead_matmul"); multihead_op_desc.SetInput("Input", {input0->Name()}); @@ -1287,7 +1287,7 @@ int MultiHeadMatmulV3FusePass::BuildFusionV3(Graph* graph, int head_number = BOOST_GET_CONST(std::vector, reshape_desc->GetAttr("shape")).at(2); - OpDesc multihead_op_desc; + OpDesc multihead_op_desc(mul0->Op()->Block()); multihead_op_desc.SetType("multihead_matmul"); multihead_op_desc.SetInput("Input", {input0->Name()}); diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt index 09744bf60032e..365083a34782a 100644 --- 
a/paddle/fluid/framework/new_executor/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/CMakeLists.txt @@ -2,9 +2,9 @@ set(INTERPRETERCORE_DEPS op_registry device_context scope framework_proto data_f lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method graph_to_program_pass variable_helper timer monitor) -cc_library(workqueue SRCS workqueue.cc DEPS enforce) +cc_library(workqueue SRCS workqueue.cc workqueue_utils.cc DEPS enforce) cc_library(interpretercore_garbage_collector SRCS interpretercore_garbage_collector.cc DEPS workqueue ${DEVICE_EVENT_LIBS}) -cc_library(interpretercore_util SRCS interpretercore_util.cc DEPS ${INTERPRETERCORE_DEPS}) +cc_library(interpretercore_util SRCS interpretercore_util.cc DEPS ${INTERPRETERCORE_DEPS} workqueue) cc_library(event_manager SRCS event_manager.cc DEPS ${DEVICE_EVENT_LIBS} glog) cc_library(stream_analyzer SRCS stream_analyzer.cc DEPS ${DEVICE_EVENT_LIBS} glog device_context) cc_library(interpretercore SRCS interpretercore.cc DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util interpretercore_garbage_collector stream_analyzer event_manager) diff --git a/paddle/fluid/framework/new_executor/event_count.h b/paddle/fluid/framework/new_executor/event_count.h index f374456ca3814..0c6d49042d22d 100644 --- a/paddle/fluid/framework/new_executor/event_count.h +++ b/paddle/fluid/framework/new_executor/event_count.h @@ -50,6 +50,7 @@ #include #include #include +#include "paddle/fluid/framework/new_executor/workqueue_utils.h" namespace paddle { namespace framework { @@ -60,7 +61,7 @@ class EventCount { explicit EventCount(size_t waiter_num) : state_(kStackMask) { assert(waiter_num < (1 << kWaiterBits) - 1); - void* buffer = malloc(sizeof(Waiter) * waiter_num); + void* buffer = AlignedMalloc(sizeof(Waiter) * waiter_num, alignof(Waiter)); if (buffer == nullptr) { return; } @@ -78,7 +79,7 @@ class EventCount { ~EventCount() { // Ensure there are no waiters. assert(state_.load() == kStackMask); - free(waiters_); + AlignedFree(waiters_); } Waiter* GetWaiter(size_t waiter_index) { diff --git a/paddle/fluid/framework/new_executor/event_manager.cc b/paddle/fluid/framework/new_executor/event_manager.cc index a3eb1abaa6127..bd83f49db1d0e 100644 --- a/paddle/fluid/framework/new_executor/event_manager.cc +++ b/paddle/fluid/framework/new_executor/event_manager.cc @@ -24,13 +24,15 @@ void EventManager::WaitEvent(const Instruction& instruction, VLOG(3) << "Deal StreamWaitEventOrSync for " << instruction.kernel_func_.operator_base_->Type(); - auto* dev_ctx = instruction.dev_ctx_; - WaitOrSync(instruction.intput_events_, dev_ctx); + for (auto& event_iter : instruction.intput_events_) { + VLOG(3) << "wait var_id: " << event_iter.var_id_ + << " 's event with waiter_type: " << event_iter.waiter_type_; + event_iter.event_->Wait(event_iter.waiter_type_, instruction.dev_ctx_); + } } void EventManager::RecordEvent(const Instruction& instruction, - const OpFuncNode& op_func_node, const platform::Place& place) { // If InterpreterCore in on CPUPlace, do nothing. 
if (platform::is_cpu_place(place)) return; @@ -41,18 +43,5 @@ void EventManager::RecordEvent(const Instruction& instruction, } } -void EventManager::WaitOrSync(const std::vector& events, - const platform::DeviceContext* dev_ctx) { - for (auto& event_iter : events) { - if (event_iter.is_sync_) { - VLOG(3) << "host sync wait in_var_id " << event_iter.var_id_; - event_iter.event_->Wait(platform::kCPU, dev_ctx); - } else { - VLOG(3) << "stream async wait in_var_id " << event_iter.var_id_; - event_iter.event_->Wait(platform::kCUDA, dev_ctx); - } - } -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/event_manager.h b/paddle/fluid/framework/new_executor/event_manager.h index a2f7b52732ee2..d23c240469f96 100644 --- a/paddle/fluid/framework/new_executor/event_manager.h +++ b/paddle/fluid/framework/new_executor/event_manager.h @@ -21,14 +21,9 @@ namespace framework { class EventManager { public: void RecordEvent(const Instruction& instruction, - const OpFuncNode& op_func_node, const platform::Place& place); void WaitEvent(const Instruction& instruction, const platform::Place& place); - - private: - void WaitOrSync(const std::vector& events, - const platform::DeviceContext* dev_ctx); }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 2a7d0d05e90a8..7d9d3d5fef14a 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -18,11 +18,15 @@ #include #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" +#include "paddle/fluid/platform/profiler.h" -DEFINE_bool(new_executor_use_inplace, true, "Use inplace in new executor"); +PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, true, + "Use inplace in new executor"); namespace paddle { namespace framework { +// NOTE(Aurelius84): Need a better strategy to determine it. +static constexpr size_t kHostNumThreads = 4; InterpreterCore::InterpreterCore(const platform::Place& place, const ProgramDesc& main_prog, @@ -32,7 +36,8 @@ InterpreterCore::InterpreterCore(const platform::Place& place, : place_(place), main_program_(main_prog), global_scope_(global_scope), - stream_analyzer_(place) { + stream_analyzer_(place), + async_work_queue_(kHostNumThreads) { is_build_ = false; feed_names_ = feed_names; @@ -89,7 +94,7 @@ paddle::framework::FetchList InterpreterCore::Run( Convert(); } else { FeedInput(); - ExecuteInstructionList(vec_instruction_, *global_scope_, place_); + ExecuteInstructionList(vec_instruction_); } // return Fetch Tensors @@ -112,6 +117,7 @@ void InterpreterCore::Convert() { temp_inst.kernel_func_.operator_base_ = op_base; temp_inst.input_index_ = vec_func_list_[i].input_index; temp_inst.output_index_ = vec_func_list_[i].output_index; + temp_inst.type_ = vec_func_list_[i].type_; OpInOutInfo info; @@ -168,8 +174,8 @@ void InterpreterCore::Convert() { } } - // In Program, op order is a very import information. - // Op can noly add op after it as next as next ops. + // In Program, op order is a very important information. + // Op can only add op after it as next as next ops. 
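// The comment above refers to the loop that follows, which turns each op's filtered
// next-op list into per-op dependency counts (dependecy_count_[inst_id]++ for every
// downstream op). A rough, self-contained sketch of that in-degree bookkeeping, using
// plain adjacency lists instead of Paddle's Instruction graph (names are illustrative):
#include <cstddef>
#include <vector>

// Given adjacency lists (op -> downstream ops), count how many upstream ops each op
// waits on. Ops whose count stays zero can be queued for execution immediately.
std::vector<size_t> BuildDependencyCount(
    const std::vector<std::vector<size_t>>& next_ops) {
  std::vector<size_t> dep_count(next_ops.size(), 0);
  for (size_t i = 0; i < next_ops.size(); ++i) {
    for (size_t next : next_ops[i]) {
      ++dep_count[next];
    }
  }
  return dep_count;
}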
std::vector filter_next; filter_next.reserve(vec_temp.size()); for (auto item : vec_temp) { @@ -178,8 +184,7 @@ void InterpreterCore::Convert() { } } - stream_analyzer_.Schedule(vec_func_list_, filter_next, i, - &vec_instruction_); + stream_analyzer_.Schedule(filter_next, &vec_instruction_, i); for (auto inst_id : filter_next) { dependecy_count_[inst_id]++; @@ -204,6 +209,23 @@ void InterpreterCore::Convert() { } } +bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) { + if (!global_scope_->vec_meta_info_[var_index].vardesc_) { + return input_var2op_info_[var_index].size() == 1; + } else { + int is_input_cnt = 0; + for (auto inst_id : input_var2op_info_[var_index]) { + OpInOutInfo info; + info.Build(vec_instruction_[inst_id].kernel_func_.operator_base_); + if (info.IsInArgBufferNeeded( + global_scope_->vec_meta_info_[var_index].vardesc_->Name())) { + is_input_cnt++; + } + } + return is_input_cnt == 1; + } +} + void InterpreterCore::BuildInplace() { for (size_t i = 0; i < vec_instruction_.size(); ++i) { if (!vec_instruction_[i] @@ -219,7 +241,7 @@ void InterpreterCore::BuildInplace() { for (auto& pair : in_to_outs) { auto iter = vec_instruction_[i].input_index_.find(pair.first); if (iter != vec_instruction_[i].input_index_.end()) { - if (input_var2op_info_[iter->second[0]].size() == 1) { + if (BuildInplaceCheckVarIsOnlyInput(iter->second[0])) { auto iterout = vec_instruction_[i].output_index_.find(pair.second); if (iterout != vec_instruction_[i].output_index_.end()) { auto invar = global_scope_->var_list[iter->second[0]]; @@ -227,6 +249,15 @@ void InterpreterCore::BuildInplace() { if (invar && outvar) { vec_instruction_[i].vec_inplace_in_to_out_.emplace_back(invar, outvar); + VLOG(3) << "inplace " + << vec_instruction_[i].kernel_func_.operator_base_->Type() + << " " + << global_scope_->vec_meta_info_[iter->second[0]] + .vardesc_->Name() + << " -> " + << global_scope_->vec_meta_info_[iterout->second[0]] + .vardesc_->Name() + << std::endl; } } } @@ -300,9 +331,12 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { VLOG(3) << "RunInstruction: " << instr_node.kernel_func_.operator_base_->Type(); - static_cast( - instr_node.kernel_func_.operator_base_) - ->InferShape(instr_node.infershape_ctx_.get()); + { + platform::RecordEvent infershape_event("InferShape"); + static_cast( + instr_node.kernel_func_.operator_base_) + ->InferShape(instr_node.infershape_ctx_.get()); + } if (FLAGS_new_executor_use_inplace) { for (auto& pair : instr_node.vec_inplace_in_to_out_) { @@ -314,67 +348,79 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { } } } - - instr_node.kernel_func_.compute_func_(*instr_node.execution_ctx_.get()); + { + platform::RecordEvent compute_event("Compute"); + instr_node.kernel_func_.compute_func_(*instr_node.execution_ctx_.get()); + } } void InterpreterCore::ExecuteInstructionList( - const std::vector& vec_instr, const VariableScope& var_scope, - const platform::Place& place, bool is_dry_run) { - std::queue working_queue; - auto working_dependecy_count = dependecy_count_; + const std::vector& vec_instr) { + auto atomic_deps = async_work_queue_.PrepareAtomicDeps(dependecy_count_); + auto atomic_var_ref = async_work_queue_.PrepareAtomicVarRef(vec_meta_info_); + std::atomic op_run_number{0}; + for (size_t i = 0; i < dependecy_count_.size(); ++i) { if (dependecy_count_[i] == 0) { - working_queue.push(i); + async_work_queue_.AddTask(vec_instr[i].type_, [&, i]() { + RunInstructionAsync(i, &atomic_deps, &atomic_var_ref, 
&op_run_number); + }); } } - auto working_var_ref = vec_meta_info_; - - size_t run_op_number = 0; - while (!working_queue.empty()) { - auto instr_id = working_queue.front(); - working_queue.pop(); - auto& instr_node = vec_instr[instr_id]; - // step1 : stream_wait (non-block host) or sync (block host) - event_manager_.WaitEvent(instr_node, place_); - // step2: run instruction - RunInstruction(instr_node); - ++run_op_number; - - if (is_dry_run) { - dry_run_profiler_.ParseMemoryInfo(var_scope.var_list); - } - - // step3: insert event for out_vars if needed - event_manager_.RecordEvent(instr_node, vec_func_list_[instr_id], place_); + async_work_queue_.WaitEmpty(); - // step4: update working_queue - auto& next_instr = instr_node.next_instruction_.all_next_ops_; + PADDLE_ENFORCE_EQ( + op_run_number.load(), vec_instr.size(), + platform::errors::Fatal( + "Required op_run_number == %d, but received op_run_number = %d.", + vec_instr.size(), op_run_number.load())); +} - for (auto next_i : next_instr) { - --working_dependecy_count[next_i]; - if (working_dependecy_count[next_i] == 0) { - working_queue.push(next_i); - } +void InterpreterCore::RunInstructionAsync(size_t instr_id, + AtomicVectorSizeT* atomic_deps, + AtomicVectorSizeT* atomic_var_ref, + std::atomic* op_run_number) { + auto& instr_node = vec_instruction_[instr_id]; + platform::RecordEvent instruction_event( + instr_node.kernel_func_.operator_base_->Type()); + event_manager_.WaitEvent(instr_node, place_); + + RunInstruction(instr_node); + + event_manager_.RecordEvent(instr_node, place_); + op_run_number->fetch_add(1, std::memory_order_relaxed); + + auto& next_instr = instr_node.next_instruction_.all_next_ops_; + + for (auto next_i : next_instr) { + // fetch_sub return value before applying sub + bool is_ready = + atomic_deps->at(next_i)->fetch_sub(1, std::memory_order_relaxed) == 1; + if (is_ready) { + async_work_queue_.AddTask(vec_instruction_[next_i].type_, [=]() { + RunInstructionAsync(next_i, atomic_deps, atomic_var_ref, op_run_number); + }); } - - // GC infomation - CheckGC(instr_id, instr_node.gc_check_var_list, var_scope, place, - working_var_ref); } + // GC infomation + CheckGC(instr_id, instr_node.gc_check_var_list, atomic_var_ref); } void InterpreterCore::CheckGC(size_t instr_id, const std::vector& gc_check_list, - const VariableScope& var_scope, - const platform::Place& place, - std::vector& working_var_ref) { + AtomicVectorSizeT* atomic_var_ref) { + auto& var_scope = *global_scope_; + for (auto var_id : gc_check_list) { - --working_var_ref[var_id].var_ref_count_; - if (var_scope.vec_meta_info_[var_id].vardesc_ && - !var_scope.vec_meta_info_[var_id].vardesc_->Persistable() && - working_var_ref[var_id].var_ref_count_ == 0) { + bool is_ready = atomic_var_ref->at(var_id)->fetch_sub( + 1, std::memory_order_relaxed) == 1; + if (is_ready && var_scope.vec_meta_info_[var_id].vardesc_ && + !var_scope.vec_meta_info_[var_id].vardesc_->Persistable()) { + gc_.Add(var_scope.var_list[var_id], gc_event_[instr_id], + vec_instruction_[instr_id].dev_ctx_); + } else if (is_ready && + var_scope.vec_meta_info_[var_id].vardesc_ == nullptr) { gc_.Add(var_scope.var_list[var_id], gc_event_[instr_id], vec_instruction_[instr_id].dev_ctx_); } @@ -417,8 +463,7 @@ const CostInfo& InterpreterCore::DryRun( // DryRun may be called many times. 
dry_run_profiler_.Reset(); dry_run_profiler_.Start(); - ExecuteInstructionList(vec_instruction_, *global_scope_, place_, - /*is_dry_run=*/true); + ExecuteInstructionList(vec_instruction_); platform::DeviceContextPool::Instance().Get(place_)->Wait(); dry_run_profiler_.Pause(); diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index d1eff9272d658..e594f9ca8b54b 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -29,10 +29,12 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/memory/allocation/spin_lock.h" #include "paddle/fluid/platform/device_event.h" namespace paddle { namespace framework { +using AtomicVectorSizeT = std::vector>>; class InterpreterCore { public: @@ -55,19 +57,21 @@ class InterpreterCore { void BuildInplace(); + bool BuildInplaceCheckVarIsOnlyInput(size_t var_index); + void RunInstruction(const Instruction& instr_node); - void ExecuteInstructionList(const std::vector& vec_instr, - const VariableScope& var_scope, - const platform::Place& place, - bool is_dry_run = false); + void ExecuteInstructionList(const std::vector& vec_instr); void DryRunPrepare(const std::vector& feed_tensors); void CheckGC(size_t instr_id, const std::vector& gc_check_list, - const VariableScope& var_scope, const platform::Place& place, - std::vector& working_var_ref); // NOLINT + AtomicVectorSizeT* working_var_ref); + void RunInstructionAsync(size_t instr_id, + AtomicVectorSizeT* working_dependecy_count, + AtomicVectorSizeT* working_var_ref, + std::atomic* op_run_number); void AddFetch(const std::vector& fetch_names); void BuildSkipShareLoDInfo(); @@ -93,6 +97,7 @@ class InterpreterCore { InterpreterProfiler dry_run_profiler_; StreamAnalyzer stream_analyzer_; EventManager event_manager_; + interpretercore::AsyncWorkQueue async_work_queue_; InterpreterCoreGarbageCollector gc_; std::vector gc_event_; diff --git a/paddle/fluid/framework/new_executor/interpretercore_garbage_collector.cc b/paddle/fluid/framework/new_executor/interpretercore_garbage_collector.cc index 0f90e37c7b706..2ae84d9dcdddd 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_garbage_collector.cc @@ -23,8 +23,8 @@ InterpreterCoreGarbageCollector::InterpreterCoreGarbageCollector() { max_memory_size_ = static_cast(GetEagerDeletionThreshold()); cur_memory_size_ = 0; - WorkQueueOptions options; - options.num_threads = 1; + WorkQueueOptions options(/*num_threads*/ 1, /*allow_spinning*/ true, + /*track_task*/ false); queue_ = CreateSingleThreadedWorkQueue(options); } diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 9802d5fa91255..16df5d794f4d4 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -18,6 +18,27 @@ namespace paddle { namespace framework { namespace interpretercore { +AtomicVectorSizeT AsyncWorkQueue::PrepareAtomicDeps( + const std::vector& dependecy_count) { + AtomicVectorSizeT working_dependecy_count(dependecy_count.size()); + for (size_t i = 0; i < dependecy_count.size(); ++i) { + working_dependecy_count[i] = + std::make_unique>(dependecy_count[i]); + } + return working_dependecy_count; +} + 
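// PrepareAtomicDeps above copies the statically computed dependency counts into
// unique_ptr<atomic<size_t>> slots so that worker threads can decrement them
// concurrently. fetch_sub returns the pre-decrement value, so a result of 1 means the
// caller just released the last dependency and may schedule the successor. A simplified
// sketch of that countdown; the AddTask/AsyncWorkQueue scheduling step is omitted:
#include <atomic>
#include <cstddef>
#include <memory>
#include <vector>

using AtomicCounts = std::vector<std::unique_ptr<std::atomic<size_t>>>;

// Clone plain counts into heap-allocated atomics for one run.
AtomicCounts CloneToAtomics(const std::vector<size_t>& dep_count) {
  AtomicCounts atomics(dep_count.size());
  for (size_t i = 0; i < dep_count.size(); ++i) {
    atomics[i] = std::make_unique<std::atomic<size_t>>(dep_count[i]);
  }
  return atomics;
}

// Called after op `op_id` finishes; returns the successors that just became runnable.
std::vector<size_t> ReleaseSuccessors(
    size_t op_id, const std::vector<std::vector<size_t>>& next_ops,
    AtomicCounts* deps) {
  std::vector<size_t> ready;
  for (size_t next : next_ops[op_id]) {
    if (deps->at(next)->fetch_sub(1, std::memory_order_relaxed) == 1) {
      ready.push_back(next);  // this thread dropped the count to zero
    }
  }
  return ready;
}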
+AtomicVectorSizeT AsyncWorkQueue::PrepareAtomicVarRef( + const std::vector& vec_meta_info) { + AtomicVectorSizeT working_var_ref(vec_meta_info.size()); + + for (size_t i = 0; i < vec_meta_info.size(); ++i) { + working_var_ref[i] = + std::make_unique>(vec_meta_info[i].var_ref_count_); + } + return working_var_ref; +} + bool var_can_be_deleted(const std::string& name, const BlockDesc& block) { auto* var_desc = block.FindVar(name); if (var_desc == nullptr || var_desc->Persistable()) { @@ -344,7 +365,9 @@ void build_op_func_list(const platform::Place& place, OpKernelComputeFunc(kernel_iter->second); copy_op_func_node.kernel_func_(copy_exec_ctx); VLOG(3) << "Run " << memcpy_op_type << " done."; - copy_op_func_node.type_ = OpFuncType::kQueueAsync; + // NOTE(Aurelius84): memcpy_op is expensive operation, so we tag them + // as kQueueSync and execute them in thread pool. + copy_op_func_node.type_ = OpFuncType::kQueueSync; copy_op_func_node.dev_ctx_ = dev_ctx; op_list->push_back(copy_op); vec_func_list->push_back(copy_op_func_node); diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h index 95c7bdac90a34..259f1c615533d 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.h +++ b/paddle/fluid/framework/new_executor/interpretercore_util.h @@ -32,6 +32,7 @@ #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/new_executor/new_executor_defs.h" +#include "paddle/fluid/framework/new_executor/workqueue.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -48,6 +49,39 @@ namespace framework { namespace interpretercore { +using AtomicVectorSizeT = std::vector>>; + +class AsyncWorkQueue { + public: + explicit AsyncWorkQueue(size_t host_num_threads) + : host_num_thread_(host_num_threads) { + std::vector group_options; + // for execute host Kernel + group_options.emplace_back(/*num_threads*/ host_num_threads, + /*allow_spinning*/ true, + /*track_task*/ true); + // for launch device Kernel + group_options.emplace_back(/*num_threads*/ 1, + /*allow_spinning*/ true, /*track_task*/ true); + queue_group_ = CreateWorkQueueGroup(group_options); + } + + AtomicVectorSizeT PrepareAtomicDeps( + const std::vector& dependecy_count); + AtomicVectorSizeT PrepareAtomicVarRef( + const std::vector& vec_meta_info); + + void WaitEmpty() { queue_group_->WaitQueueGroupEmpty(); } + + void AddTask(const OpFuncType& op_func_type, std::function fn) { + queue_group_->AddTask(static_cast(op_func_type), std::move(fn)); + } + + private: + size_t host_num_thread_; + std::unique_ptr queue_group_; +}; + std::string get_memcpy_type(const platform::Place& src_place, const platform::Place& dst_place); diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index ebbe3ed17bcbe..9c0444b3157cb 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -25,11 +25,6 @@ namespace paddle { namespace framework { -namespace interpretercore { -static constexpr char kMemcpyH2D[] = "memcpy_h2d"; -static constexpr char kMemcpyD2H[] = "memcpy_d2h"; -} // namespace interpretercore - using OpKernelComputeFunc = std::function; using OpKernelMap = std::unordered_map; @@ -496,17 +491,23 @@ struct NextInstruction { struct EventInter { explicit 
EventInter(size_t var_id, std::shared_ptr event, - bool is_sync) - : var_id_(var_id), event_(event), is_sync_(is_sync) {} + platform::DeviceType waiter_type) + : var_id_(var_id), event_(event), waiter_type_(waiter_type) {} size_t var_id_; std::shared_ptr event_; - bool is_sync_; + platform::DeviceType waiter_type_; }; struct InstructionInfo { std::vector dependecy_count_; }; +enum class OpFuncType { + kQueueSync = 0, // CPU kernel, block host + kQueueAsync = 1, // GPU Kernel or d2h, h2d, send, recv, broadcast +}; +class RuntimeInferShapeContext; + struct Instruction { OpKernelFunc kernel_func_; std::shared_ptr runtime_ctx_; @@ -522,15 +523,11 @@ struct Instruction { std::vector output_events_; platform::DeviceContext* dev_ctx_; // not owned + OpFuncType type_; std::vector> vec_inplace_in_to_out_; }; -enum class OpFuncType { - kQueueAsync, // GPU Kernel or d2h, h2d, send, recv, broadcast - kQueueSync, // CPU kernel, block host -}; - struct OpFuncNode { // int unsed; std::map> input_index; @@ -541,5 +538,18 @@ struct OpFuncNode { OpFuncType type_; }; +namespace interpretercore { +static constexpr char kMemcpyH2D[] = "memcpy_h2d"; +static constexpr char kMemcpyD2H[] = "memcpy_d2h"; + +static bool IsMemcpyH2D(const Instruction& instr) { + return instr.kernel_func_.operator_base_->Type() == kMemcpyH2D; +} + +static bool IsMemcpyD2H(const Instruction& instr) { + return instr.kernel_func_.operator_base_->Type() == kMemcpyD2H; +} +} // namespace interpretercore + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/nonblocking_threadpool.h index 56edcecd17f37..2997ce1fe2473 100644 --- a/paddle/fluid/framework/new_executor/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/nonblocking_threadpool.h @@ -56,9 +56,9 @@ class TaskTracker { } private: - std::atomic num_tasks_{0}; - EventCount wait_empty_cv_; - std::atomic wait_empty_{false}; + alignas(64) std::atomic num_tasks_{0}; + alignas(64) EventCount wait_empty_cv_; + alignas(64) std::atomic wait_empty_{false}; }; template @@ -70,15 +70,16 @@ class ThreadPoolTempl { ThreadPoolTempl(int num_threads, bool allow_spinning, Environment env = Environment()) : env_(env), - num_threads_(num_threads), allow_spinning_(allow_spinning), - thread_data_(num_threads), global_steal_partition_(EncodePartition(0, num_threads_)), blocked_(0), + num_tasks_(0), spinning_(0), done_(false), cancelled_(false), - ec_(num_threads_) { + ec_(num_threads), + num_threads_(num_threads), + thread_data_(num_threads) { // Calculate coprimes of all numbers [1, num_threads]. // Coprimes are used for random walks over all threads in Steal // and NonEmptyQueueIndex. Iteration is based on the fact that if we take @@ -143,6 +144,7 @@ class ThreadPoolTempl { void AddTaskWithHint(std::function fn, int start, int limit) { Task t = env_.CreateTask(std::move(fn)); PerThread* pt = GetPerThread(); + uint64_t num_tasks = num_tasks_.fetch_add(1, std::memory_order_relaxed) + 1; if (pt->pool == this) { // Worker thread of this pool, push onto the thread's queue. Queue& q = thread_data_[pt->thread_id].queue; @@ -166,8 +168,11 @@ class ThreadPoolTempl { // this. We expect that such scenario is prevented by program, that is, // this is kept alive while any threads can potentially be in Schedule. 
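// The TaskTracker hunk above adds alignas(64) so its hot members land on separate cache
// lines and threads hammering one counter do not invalidate the line holding the others
// (false sharing). A generic illustration of the layout difference; 64 bytes is assumed
// as the cache-line size, and std::hardware_destructive_interference_size could be used
// where the toolchain provides it:
#include <atomic>
#include <cstdint>

// Packed: both counters typically share one 64-byte cache line, so writers of `a` and
// writers of `b` contend even though they touch different data.
struct PackedCounters {
  std::atomic<uint64_t> a{0};
  std::atomic<uint64_t> b{0};
};

// Padded: each counter starts on its own 64-byte boundary, so updates to one do not
// force the other's cache line to bounce between cores.
struct PaddedCounters {
  alignas(64) std::atomic<uint64_t> a{0};
  alignas(64) std::atomic<uint64_t> b{0};
};

static_assert(sizeof(PaddedCounters) >= 2 * 64,
              "each padded counter occupies its own cache line");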
if (!t.f) { - ec_.Notify(false); + if (num_tasks > num_threads_ - blocked_.load(std::memory_order_relaxed)) { + ec_.Notify(false); + } } else { + num_tasks_.fetch_sub(1, std::memory_order_relaxed); env_.ExecuteTask(t); // Push failed, execute directly. } } @@ -259,16 +264,17 @@ class ThreadPoolTempl { }; Environment env_; - const int num_threads_; const bool allow_spinning_; - std::vector thread_data_; std::vector> all_coprimes_; unsigned global_steal_partition_; std::atomic blocked_; + std::atomic num_tasks_; std::atomic spinning_; std::atomic done_; std::atomic cancelled_; EventCount ec_; + const int num_threads_; + std::vector thread_data_; // Main worker thread loop. void WorkerLoop(int thread_id) { @@ -305,6 +311,7 @@ class ThreadPoolTempl { } if (t.f) { env_.ExecuteTask(t); + num_tasks_.fetch_sub(1, std::memory_order_relaxed); } } } else { @@ -315,8 +322,7 @@ class ThreadPoolTempl { if (!t.f) { t = GlobalSteal(); if (!t.f) { - // Leave one thread spinning. This reduces latency. - if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) { + if (allow_spinning_) { for (int i = 0; i < spin_count && !t.f; i++) { if (!cancelled_.load(std::memory_order_relaxed)) { t = GlobalSteal(); @@ -324,7 +330,6 @@ class ThreadPoolTempl { return; } } - spinning_ = false; } if (!t.f) { if (!WaitForWork(waiter, &t)) { @@ -336,6 +341,7 @@ class ThreadPoolTempl { } if (t.f) { env_.ExecuteTask(t); + num_tasks_.fetch_sub(1, std::memory_order_relaxed); } } } diff --git a/paddle/fluid/framework/new_executor/profiler.h b/paddle/fluid/framework/new_executor/profiler.h index 5d8ec05b7f28e..77783535b6471 100644 --- a/paddle/fluid/framework/new_executor/profiler.h +++ b/paddle/fluid/framework/new_executor/profiler.h @@ -64,10 +64,8 @@ static std::pair GetTensorMemorySize( } struct CostInfo { - double total_time{0.}; // ms - size_t host_memory_bytes{0}; // bytes - size_t device_memory_bytes{0}; // bytes - size_t device_total_memory_bytes{0}; // total allocated memory size + double total_time{0.}; // ms + size_t device_memory_bytes{0}; // total allocated memory size }; class InterpreterProfiler { @@ -82,30 +80,14 @@ class InterpreterProfiler { void Reset() { timer_.Reset(); cost_info_.total_time = 0.; - cost_info_.host_memory_bytes = 0; cost_info_.device_memory_bytes = 0; - cost_info_.device_total_memory_bytes = 0; - } - - void ParseMemoryInfo(const std::vector& vars) { - timer_.Start(); - auto memory_info = GetTensorMemorySize(vars); - VLOG(3) << "host memory size: " << memory_info.first; - cost_info_.host_memory_bytes = - std::max(cost_info_.host_memory_bytes, memory_info.first); - - VLOG(3) << "device memory size: " << memory_info.second; - cost_info_.device_memory_bytes = - std::max(cost_info_.device_memory_bytes, memory_info.second); - timer_.Pause(); - cost_info_.total_time -= timer_.ElapsedMS(); } void TotalCUDAAllocatedMemorySize(const platform::Place& place) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place); - cost_info_.device_total_memory_bytes = + cost_info_.device_memory_bytes = platform::RecordedCudaMallocSize(cuda_place.device); #endif } diff --git a/paddle/fluid/framework/new_executor/run_queue.h b/paddle/fluid/framework/new_executor/run_queue.h index 707aadd315885..13035237ff8b4 100644 --- a/paddle/fluid/framework/new_executor/run_queue.h +++ b/paddle/fluid/framework/new_executor/run_queue.h @@ -204,7 +204,6 @@ class RunQueue { kReady, }; - std::mutex mutex_; // Low log(kSize) + 1 bits 
in front_ and back_ contain rolling index of // front/back, respectively. The remaining bits contain modification counters // that are incremented on Push operations. This allows us to (1) distinguish @@ -214,6 +213,7 @@ class RunQueue { // modification counters. alignas(64) std::atomic front_; alignas(64) std::atomic back_; + std::mutex mutex_; Elem array_[kSize]; // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false, diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.cc b/paddle/fluid/framework/new_executor/stream_analyzer.cc index 13bbda0f31f42..a9322d8fc88ed 100644 --- a/paddle/fluid/framework/new_executor/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/stream_analyzer.cc @@ -22,7 +22,7 @@ namespace framework { * Parse the var_ids that need to be associated with an event. * The caller should guarantee front_op and back_op satisfy the * following conditions: - * 1. kQueueAsync -> kQueueAsync + * 1. kQueueSync -> kQueueAsync * 2. kQueueAsync -> kQueueSync * * For example: matmul(gpu) -> out_var -> memcpy_d2h @@ -48,7 +48,7 @@ std::vector StreamAnalyzer::ParseEventVarIds( void StreamAnalyzer::AssociateInputWithEvents( const std::vector& new_event_var_id, Instruction* next_instr, - bool is_sync) { + platform::DeviceType waiter_type) { for (auto var_id : new_event_var_id) { if (var_id2event_.count(var_id) == 0) { auto device_event = std::make_shared( @@ -57,52 +57,43 @@ void StreamAnalyzer::AssociateInputWithEvents( } // Add events for next_instr.inputs next_instr->intput_events_.emplace_back(var_id, var_id2event_.at(var_id), - is_sync); + waiter_type); } } -void StreamAnalyzer::Schedule(const std::vector& op_func_nodes, - const std::vector& downstream_ops, - size_t op_index, - std::vector* instructions) { - auto& op_func_type = op_func_nodes[op_index].type_; +void StreamAnalyzer::Schedule(const std::vector& downstream_ops, + std::vector* instructions, + size_t op_index) { auto& cur_instr = instructions->at(op_index); auto& next_instruction = cur_instr.next_instruction_; + std::vector event_var_ids; + for (auto next_op_id : downstream_ops) { + auto& next_instr = instructions->at(next_op_id); - if (op_func_type == OpFuncType::kQueueSync) { - // all downstream ops of kQueueSync can directly run, such as CPU -> Any - next_instruction.direct_run_ = downstream_ops; - } else { // kQueueAsync - std::vector event_var_ids; - for (auto next_op_id : downstream_ops) { - auto& next_instr = instructions->at(next_op_id); - // case 1: GPU -> GPU(same stream) - if (cur_instr.dev_ctx_ == next_instr.dev_ctx_) { - next_instruction.direct_run_.emplace_back(next_op_id); - continue; - } + if (IsDirectRun(cur_instr, next_instr)) { + next_instruction.direct_run_.emplace_back(next_op_id); + } else { // Always insert events between different stream auto new_event_var_ids = ParseEventVarIds(cur_instr, next_instr); event_var_ids.insert(event_var_ids.end(), new_event_var_ids.begin(), new_event_var_ids.end()); - bool is_sync = - (op_func_nodes[next_op_id].type_ == OpFuncType::kQueueSync); - AssociateInputWithEvents(new_event_var_ids, &next_instr, is_sync); + auto waiter_type = GetWaiterType(next_instr); + AssociateInputWithEvents(new_event_var_ids, &next_instr, waiter_type); - if (is_sync) { // GPU -> CPU + if (waiter_type == platform::kCPU) { // GPU -> CPU next_instruction.synchronize_run_.emplace_back(next_op_id); } else { // GPU -> GPU(different stream) next_instruction.event_wait_run_.emplace_back(next_op_id); } } - // Create events for these cross-stream vars - VLOG(3) 
<< cur_instr.kernel_func_.operator_base_->Type() - << " event_var_ids.size: " << event_var_ids.size(); - for (auto var_id : event_var_ids) { - cur_instr.output_events_.emplace_back(var_id, var_id2event_.at(var_id), - false /*not used*/); - } + } + // Create events for these cross-stream vars + VLOG(3) << cur_instr.kernel_func_.operator_base_->Type() + << " event_var_ids.size: " << event_var_ids.size(); + for (auto var_id : event_var_ids) { + cur_instr.output_events_.emplace_back(var_id, var_id2event_.at(var_id), + platform::kCUDA /*not used*/); } } @@ -121,5 +112,27 @@ platform::DeviceContext* StreamAnalyzer::ParseDeviceContext( return dev_ctx; } +/* + * NOTE(dev): The following cases are considered as directly run: + * + * 1. with same dev_ctx_, such as: CPU -> CPU, GPU -> GPU + * 2. D2H -> CPU + * 3. CPU -> H2D + */ +bool StreamAnalyzer::IsDirectRun(Instruction& cur_instr, + const Instruction& next_instr) { + return (cur_instr.dev_ctx_ == next_instr.dev_ctx_ || + interpretercore::IsMemcpyD2H(cur_instr) || + interpretercore::IsMemcpyH2D(next_instr)); +} + +platform::DeviceType StreamAnalyzer::GetWaiterType(const Instruction& instr) { + if (instr.type_ == OpFuncType::kQueueSync) { + return platform::kCPU; + } else { + return platform::kCUDA; + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.h b/paddle/fluid/framework/new_executor/stream_analyzer.h index ee94c21fc529a..dc2af389e36b0 100644 --- a/paddle/fluid/framework/new_executor/stream_analyzer.h +++ b/paddle/fluid/framework/new_executor/stream_analyzer.h @@ -29,9 +29,8 @@ class StreamAnalyzer { ~StreamAnalyzer() {} - void Schedule(const std::vector& op_func_nodes, - const std::vector& downstream_ops, size_t op_index, - std::vector* instructions); + void Schedule(const std::vector& downstream_ops, + std::vector* instructions, size_t op_index); platform::DeviceContext* ParseDeviceContext(const OpFuncNode& op_func_node, const OperatorBase& op_base); @@ -41,7 +40,14 @@ class StreamAnalyzer { const Instruction& next_instr); void AssociateInputWithEvents(const std::vector& new_event_var_id, - Instruction* next_instr, bool is_sync); + Instruction* next_instr, + platform::DeviceType waiter_type); + + bool IsDirectRun(Instruction& cur_instr, // NOLINT + const Instruction& next_instr); + + platform::DeviceType GetWaiterType(const Instruction& instr); + platform::Place place_; platform::DeviceContextPool d2h_ctx_pool_; platform::DeviceContextPool h2d_ctx_pool_; diff --git a/paddle/fluid/framework/new_executor/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue.cc index 184d9d6998464..bc5a4e27dc528 100644 --- a/paddle/fluid/framework/new_executor/workqueue.cc +++ b/paddle/fluid/framework/new_executor/workqueue.cc @@ -18,14 +18,18 @@ class WorkQueueImpl : public WorkQueue { explicit WorkQueueImpl(const WorkQueueOptions& options) : WorkQueue(options), queue_(nullptr), tracker_(nullptr) { if (options_.track_task) { - tracker_ = new TaskTracker; + void* storage = AlignedMalloc(sizeof(TaskTracker), alignof(TaskTracker)); + tracker_ = new (storage) TaskTracker; } queue_ = new NonblockingThreadPool(options_.num_threads, options_.allow_spinning); } virtual ~WorkQueueImpl() { - delete tracker_; + if (tracker_ != nullptr) { + tracker_->~TaskTracker(); + AlignedFree(tracker_); + } delete queue_; } @@ -89,7 +93,8 @@ WorkQueueGroupImpl::WorkQueueGroupImpl( for (size_t idx = 0; idx < num_queues; ++idx) { const auto& options = queues_options_[idx]; if (options.track_task && tracker_ == 
nullptr) { - tracker_ = new TaskTracker; + void* storage = AlignedMalloc(sizeof(TaskTracker), alignof(TaskTracker)); + tracker_ = new (storage) TaskTracker; } queues_[idx] = new (&queues_storage_[idx]) NonblockingThreadPool(options.num_threads, options.allow_spinning); @@ -100,7 +105,10 @@ WorkQueueGroupImpl::~WorkQueueGroupImpl() { for (auto queue : queues_) { queue->~NonblockingThreadPool(); } - delete tracker_; + if (tracker_ != nullptr) { + tracker_->~TaskTracker(); + AlignedFree(tracker_); + } free(queues_storage_); } @@ -147,7 +155,7 @@ std::unique_ptr CreateSingleThreadedWorkQueue( "For a SingleThreadedWorkQueue, " "WorkQueueOptions.num_threads must equals to 1.")); std::unique_ptr ptr(new WorkQueueImpl(options)); - return std::move(ptr); + return ptr; } std::unique_ptr CreateMultiThreadedWorkQueue( @@ -158,7 +166,7 @@ std::unique_ptr CreateMultiThreadedWorkQueue( "WorkQueueOptions.num_threads must be " "greater than 1.")); std::unique_ptr ptr(new WorkQueueImpl(options)); - return std::move(ptr); + return ptr; } std::unique_ptr CreateWorkQueueGroup( @@ -168,7 +176,7 @@ std::unique_ptr CreateWorkQueueGroup( "For a WorkQueueGroup, the number of WorkQueueOptions " "must be greater than 1.")); std::unique_ptr ptr(new WorkQueueGroupImpl(queues_options)); - return std::move(ptr); + return ptr; } } // namespace framework diff --git a/paddle/fluid/framework/new_executor/workqueue.h b/paddle/fluid/framework/new_executor/workqueue.h index 32e90641bbc2b..ead9d9949b700 100644 --- a/paddle/fluid/framework/new_executor/workqueue.h +++ b/paddle/fluid/framework/new_executor/workqueue.h @@ -22,9 +22,14 @@ namespace paddle { namespace framework { struct WorkQueueOptions { - size_t num_threads{0}; - bool allow_spinning{true}; - bool track_task{false}; + WorkQueueOptions(size_t num_threads, bool allow_spinning, bool track_task) + : num_threads(num_threads), + allow_spinning(allow_spinning), + track_task(track_task) {} + + size_t num_threads; + bool allow_spinning; + bool track_task; }; class WorkQueue { diff --git a/paddle/fluid/framework/new_executor/workqueue_test.cc b/paddle/fluid/framework/new_executor/workqueue_test.cc index cec1274259ef0..c229a84b145ab 100644 --- a/paddle/fluid/framework/new_executor/workqueue_test.cc +++ b/paddle/fluid/framework/new_executor/workqueue_test.cc @@ -26,9 +26,8 @@ TEST(WorkQueue, TestSingleThreadedWorkQueue) { std::atomic counter{0}; constexpr unsigned kLoopNum = 1000000; // CreateSingleThreadedWorkQueue - WorkQueueOptions options; - options.num_threads = 1; - options.track_task = true; + WorkQueueOptions options(/*num_threads*/ 1, /*allow_spinning*/ true, + /*track_task*/ true); auto work_queue = CreateSingleThreadedWorkQueue(options); // NumThreads EXPECT_EQ(work_queue->NumThreads(), 1u); @@ -58,9 +57,8 @@ TEST(WorkQueue, TestMultiThreadedWorkQueue) { constexpr unsigned kExternalLoopNum = 100; constexpr unsigned kLoopNum = 1000000; // CreateMultiThreadedWorkQueue - WorkQueueOptions options; - options.num_threads = 10; - options.track_task = true; + WorkQueueOptions options(/*num_threads*/ 10, /*allow_spinning*/ true, + /*track_task*/ true); auto work_queue = CreateMultiThreadedWorkQueue(options); // NumThreads EXPECT_EQ(work_queue->NumThreads(), 10u); @@ -91,12 +89,10 @@ TEST(WorkQueue, TestWorkQueueGroup) { constexpr unsigned kExternalLoopNum = 100; constexpr unsigned kLoopNum = 1000000; // CreateMultiThreadedWorkQueue - WorkQueueOptions sq_options; - sq_options.num_threads = 1; - sq_options.track_task = true; - WorkQueueOptions mq_options; - 
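// The WorkQueueImpl/WorkQueueGroupImpl hunks above replace `new TaskTracker` and
// `delete tracker_` with AlignedMalloc plus placement new, and an explicit destructor
// call plus AlignedFree, because TaskTracker is now over-aligned by its alignas(64)
// members. A generic sketch of that pattern; MyAlignedMalloc/MyAlignedFree are
// illustrative stand-ins for the helpers declared in workqueue_utils.h:
#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <new>

void* MyAlignedMalloc(std::size_t size, std::size_t alignment) {
  // std::aligned_alloc (C++17) requires size to be a multiple of alignment.
  size = (size + alignment - 1) / alignment * alignment;
  return std::aligned_alloc(alignment, size);
}
void MyAlignedFree(void* p) { std::free(p); }

struct alignas(64) OveralignedTracker {
  long counter = 0;
};

int main() {
  void* storage =
      MyAlignedMalloc(sizeof(OveralignedTracker), alignof(OveralignedTracker));
  assert(storage != nullptr);
  // Placement new: construct the object inside the correctly aligned storage.
  auto* tracker = new (storage) OveralignedTracker;
  tracker->counter = 42;
  // Tear-down mirrors construction: destroy explicitly, then free the raw storage.
  tracker->~OveralignedTracker();
  MyAlignedFree(storage);
  return 0;
}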
mq_options.num_threads = 10; - mq_options.track_task = true; + WorkQueueOptions sq_options(/*num_threads*/ 1, /*allow_spinning*/ true, + /*track_task*/ true); + WorkQueueOptions mq_options(/*num_threads*/ 10, /*allow_spinning*/ true, + /*track_task*/ true); auto queue_group = CreateWorkQueueGroup({sq_options, mq_options}); // NumThreads EXPECT_EQ(queue_group->QueueNumThreads(0), 1u); diff --git a/paddle/fluid/framework/new_executor/workqueue_utils.cc b/paddle/fluid/framework/new_executor/workqueue_utils.cc new file mode 100644 index 0000000000000..2ea49e676a807 --- /dev/null +++ b/paddle/fluid/framework/new_executor/workqueue_utils.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/new_executor/workqueue_utils.h" +#include +#include + +namespace paddle { +namespace framework { + +void* AlignedMalloc(size_t size, size_t alignment) { + assert(alignment >= sizeof(void*) && (alignment & (alignment - 1)) == 0); + size = (size + alignment - 1) / alignment * alignment; +#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L + void* aligned_mem = nullptr; + if (posix_memalign(&aligned_mem, alignment, size) != 0) { + aligned_mem = nullptr; + } + return aligned_mem; +#elif defined(_WIN32) + return _aligned_malloc(size, alignment); +#else + void* mem = malloc(size + alignment); + if (mem == nullptr) { + return nullptr; + } + size_t adjust = alignment - reinterpret_cast(mem) % alignment; + void* aligned_mem = reinterpret_cast(mem) + adjust; + *(reinterpret_cast(aligned_mem) - 1) = mem; + assert(reinterpret_cast(aligned_mem) % alignment == 0); + return aligned_mem; +#endif +} + +void AlignedFree(void* mem_ptr) { +#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L + free(mem_ptr); +#elif defined(_WIN32) + _aligned_free(mem_ptr); +#else + if (mem_ptr) { + free(*(reinterpret_cast(mem_ptr) - 1)); + } +#endif +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/workqueue_utils.h b/paddle/fluid/framework/new_executor/workqueue_utils.h index 00183eadcbb5c..6907f2f17da0d 100644 --- a/paddle/fluid/framework/new_executor/workqueue_utils.h +++ b/paddle/fluid/framework/new_executor/workqueue_utils.h @@ -59,5 +59,9 @@ class CounterGuard { Holder* counter_holder_{nullptr}; }; +void* AlignedMalloc(size_t size, size_t alignment); + +void AlignedFree(void* memory_ptr); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 51e5df3e168c3..0eafbb027f042 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -178,7 +178,7 @@ class OpDesc { } proto::OpDesc desc_; - BlockDesc *block_; // not_own + BlockDesc *block_{nullptr}; // not_own // input arg name => input variable names VariableNameMap inputs_; // output arg name => output variable names diff --git a/paddle/fluid/framework/operator.cc 
b/paddle/fluid/framework/operator.cc index 0d5db737441db..670cb36dcc3ab 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -47,7 +47,8 @@ class LoDTensor; DECLARE_bool(benchmark); DECLARE_bool(check_nan_inf); DECLARE_bool(enable_unused_var_check); -DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op"); +PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0, + "number of threads for inner op"); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 516a3bc63cad6..adbbfb380bc45 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -46,11 +46,13 @@ DECLARE_double(eager_delete_tensor_gb); #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" #endif -DEFINE_string(pe_profile_fname, "", - "Profiler filename for PE, which generated by gperftools." - "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable."); -DEFINE_bool(enable_parallel_graph, false, - "Force disable parallel graph execution mode if set false."); +PADDLE_DEFINE_EXPORTED_string( + pe_profile_fname, "", + "Profiler filename for PE, which generated by gperftools." + "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable."); +PADDLE_DEFINE_EXPORTED_bool( + enable_parallel_graph, false, + "Force disable parallel graph execution mode if set false."); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc index d0558abaf5184..b577608de6c59 100644 --- a/paddle/fluid/framework/prune.cc +++ b/paddle/fluid/framework/prune.cc @@ -180,6 +180,35 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) { auto& op_desc = *op_iter; + // TODO(wanghaipeng03) reconstruct the follwing if/else block + // to extract common code + // + // bool should_run_flag = false; + // if (IsTarget........) { + // should_run_flag = true; + // } else { + // if (parent......) { + // for (....) { + // for (.....) { + // if (.....) { + // should_run_flag = true; + // } + // } + // } + // } + // } + // + // should_run.push_back(should_run_flag); + // if (should_run_flag) { + // for (auto & var: op_desc.iputs()) { + // for (....) { + // if (.....) { + // dependent_vars->insert(argu); + // } + // } + // } + // } + if (IsTarget(op_desc) || (HasDependentOutputVar(op_desc, *dependent_vars) && (GetOpRole(op_desc) & static_cast(OpRole::kOptimize)) == 0)) { @@ -213,6 +242,13 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, } if (flag) { should_run.back() = true; + + // If any op should run, then there inputs are dependent_vars + for (auto& var : op_desc.inputs()) { + for (auto& argu : var.arguments()) { + dependent_vars->insert(argu); + } + } } } } diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index d299f1769253a..932974855a28e 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -19,7 +19,7 @@ limitations under the License. */ DECLARE_bool(benchmark); -DEFINE_bool( +PADDLE_DEFINE_EXPORTED_bool( eager_delete_scope, true, "Delete local scope eagerly. 
It will reduce GPU memory usage but " "slow down the destruction of variables.(around 1% performance harm)"); diff --git a/paddle/fluid/framework/unused_var_check.cc b/paddle/fluid/framework/unused_var_check.cc index f8ace3e85a643..2f03dc41ce002 100644 --- a/paddle/fluid/framework/unused_var_check.cc +++ b/paddle/fluid/framework/unused_var_check.cc @@ -23,9 +23,10 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" -DEFINE_bool(enable_unused_var_check, false, - "Checking whether operator contains unused inputs, " - "especially for grad operator. It should be in unittest."); +PADDLE_DEFINE_EXPORTED_bool( + enable_unused_var_check, false, + "Checking whether operator contains unused inputs, " + "especially for grad operator. It should be in unittest."); namespace paddle { namespace framework { diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 595aba887303d..cb744fb2b6aa2 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(imperative_flag SRCS flags.cc DEPS gflags) +cc_library(imperative_flag SRCS flags.cc DEPS gflags flags) IF(WITH_XPU) cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils) @@ -11,7 +11,7 @@ cc_library(amp SRCS amp_auto_cast.cc DEPS layer ) cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal) cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator) cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator) -cc_library(imperative_profiler SRCS profiler.cc) +cc_library(imperative_profiler SRCS profiler.cc DEPS flags) if(NOT WIN32) if(WITH_NCCL OR WITH_RCCL) cc_library(imperative_all_reduce SRCS all_reduce.cc DEPS collective_helper device_context selected_rows tensor) diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index eba30ff8edebf..48e5e430b136a 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -117,7 +117,7 @@ static inline std::shared_ptr CastToType( imperative::NameVarBaseMap outs = {{"Out", {out}}}; { - AutoCastGuard guard(tracer, false); + AutoCastGuard guard(tracer, 0); tracer->TraceOp("cast", ins, outs, std::move(attrs)); } @@ -225,5 +225,30 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type, return new_ins; } +NameVarBaseMap CastPureFp16Inputs(const std::string& op_type, + const NameVarBaseMap& ins) { + NameVarBaseMap new_ins(ins); + auto dst_type = framework::proto::VarType::FP16; + if (AmpOperators::Instance().GetMutableUnsupportedFp16Ops()->count(op_type) || + AmpOperators::Instance().GetMutableBlockOps()->count(op_type)) { + dst_type = framework::proto::VarType::FP32; + } + for (auto& pair : new_ins) { + if ((op_type == "batch_norm" || op_type == "layer_norm" || + op_type == "sync_batch_norm") && + pair.first != "X") { + continue; + } + VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from " + << GetDtypeStr(*pair.second.cbegin()) << " to " + << framework::DataTypeToString(dst_type); + for (auto& var : pair.second) { + var = (dst_type == framework::proto::VarType::FP32 ? 
CastToFP32(var) + : CastToFP16(var)); + } + } + return new_ins; +} + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/amp_auto_cast.h b/paddle/fluid/imperative/amp_auto_cast.h index fa76c19688a69..79bc83a777aa9 100644 --- a/paddle/fluid/imperative/amp_auto_cast.h +++ b/paddle/fluid/imperative/amp_auto_cast.h @@ -63,15 +63,16 @@ std::ostream& operator<<(std::ostream& os, AmpOperators& ops); // NOTE(zhiqiu): AutoCastGuard is used for RAII. class AutoCastGuard { public: - AutoCastGuard(std::shared_ptr tracer, bool guard_mode) + AutoCastGuard(std::shared_ptr tracer, int guard_level) : tracer_(tracer) { - pre_mode_ = tracer_->IsAutoCastEnabled(); - if (pre_mode_ != guard_mode) { - tracer_->SetEnableAutoCast(guard_mode); + pre_amp_level_ = tracer_->AMPLevel(); + + if (pre_amp_level_ != guard_level) { + tracer_->SetAMPLevel(guard_level); } } - ~AutoCastGuard() { tracer_->SetEnableAutoCast(pre_mode_); } + ~AutoCastGuard() { tracer_->SetAMPLevel(pre_amp_level_); } // forbid copy and operator= AutoCastGuard(const AutoCastGuard& guard) = delete; @@ -79,11 +80,14 @@ class AutoCastGuard { private: std::shared_ptr tracer_; - bool pre_mode_; + int pre_amp_level_; }; NameVarBaseMap AutoCastInputs(const std::string& op_type, const NameVarBaseMap& ins); +NameVarBaseMap CastPureFp16Inputs(const std::string& op_type, + const NameVarBaseMap& ins); + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/flags.cc b/paddle/fluid/imperative/flags.cc index 57656d64ab788..c2d668eccdaf9 100644 --- a/paddle/fluid/imperative/flags.cc +++ b/paddle/fluid/imperative/flags.cc @@ -13,11 +13,11 @@ // limitations under the License. #include "paddle/fluid/imperative/flags.h" -#include "gflags/gflags.h" +#include "paddle/fluid/platform/flags.h" -DEFINE_uint64(dygraph_debug, 0, - "Debug level of dygraph. This flag is not " - "open to users"); +PADDLE_DEFINE_EXPORTED_uint64(dygraph_debug, 0, + "Debug level of dygraph. This flag is not " + "open to users"); namespace paddle { namespace imperative { diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index 84ba60fef80d5..c1ec675a55707 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -137,10 +137,12 @@ static void GetGraphInfoBetweenTargets( } for (auto &pending_node : node->GradPendingNodes()) { + for (auto &pending_op : *pending_node) { + preceding_ops[&pending_op].insert(op); + } if (visited.count(pending_node.get()) == 0) { visited.insert(pending_node.get()); for (auto &pending_op : *pending_node) { - preceding_ops[&pending_op].insert(op); q.emplace(&pending_op, pending_node.get()); } } diff --git a/paddle/fluid/imperative/profiler.cc b/paddle/fluid/imperative/profiler.cc index 6d0f6a12f5229..48af63056c5e3 100644 --- a/paddle/fluid/imperative/profiler.cc +++ b/paddle/fluid/imperative/profiler.cc @@ -19,9 +19,9 @@ #endif #include #include // NOLINT -#include "gflags/gflags.h" +#include "paddle/fluid/platform/flags.h" -DEFINE_string( +PADDLE_DEFINE_EXPORTED_string( tracer_profile_fname, "xxgperf", "Profiler filename for imperative tracer, which generated by gperftools." "Only valid when compiled `WITH_PROFILER=ON`. 
Empty if disable."); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 9dc9c4d90acab..49e079c58caf3 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -176,9 +176,12 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, : attr_checker->GetDefaultAttrMap(); NameVarBaseMap new_ins = ins; - if (enable_autocast_) { + if (amp_level_ == 1) { VLOG(5) << "Auto mixed precision run operator: " << type; new_ins = AutoCastInputs(type, ins); + } else if (amp_level_ == 2) { + VLOG(5) << "Pure fp16 run operator: " << type; + new_ins = CastPureFp16Inputs(type, ins); } try { diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index b734ae5c49936..e77623d7a4609 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -105,9 +105,9 @@ class Tracer { void SetHasGrad(bool has_grad) { has_grad_ = has_grad; } - void SetEnableAutoCast(bool enabled) { enable_autocast_ = enabled; } + void SetAMPLevel(int level) { amp_level_ = level; } - bool IsAutoCastEnabled() const { return enable_autocast_; } + int AMPLevel() const { return amp_level_; } paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists( const platform::Place& place); @@ -118,9 +118,9 @@ class Tracer { bool enable_program_desc_tracing_{false}; std::unique_ptr generator_; platform::Place expected_place_; - bool enable_autocast_{false}; GarbageCollectorMap gcs_; static thread_local bool has_grad_; + int amp_level_{0}; }; // To access static variable current_tracer diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index b24005cb6d9ac..cda6dc31126d9 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -239,6 +239,22 @@ struct Argument { DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string); DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool); + DECL_ARGUMENT_FIELD(use_nnadapter, UseNNAdapter, bool); + DECL_ARGUMENT_FIELD(nnadapter_model_cache_dir, NNAdapterModelCacheDir, + std::string); + DECL_ARGUMENT_FIELD(nnadapter_device_names, NNAdapterDeviceNames, + std::vector); + DECL_ARGUMENT_FIELD(nnadapter_context_properties, NNAdapterContextProperties, + std::string); + DECL_ARGUMENT_FIELD(nnadapter_subgraph_partition_config_buffer, + NNAdapterSubgraphPartitionConfigBuffer, std::string); + DECL_ARGUMENT_FIELD(nnadapter_subgraph_partition_config_path, + NNAdapterSubgraphPartitionConfigPath, std::string); + DECL_ARGUMENT_FIELD(nnadapter_model_cache_token, NNAdapterModelCacheToken, + std::vector); + DECL_ARGUMENT_FIELD(nnadapter_model_cache_buffer, NNAdapterModelCacheBuffer, + std::vector>); + // Memory optimized related. 
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 8eb7e8d13886f..4fdd963b6abff 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -202,6 +202,27 @@ void IRPassManager::CreatePasses(Argument *argument, new std::string(argument->xpu_autotune_file())); pass->Set("precision", new std::string(argument->xpu_precision())); pass->Set("adaptive_seqlen", new bool(argument->xpu_adaptive_seqlen())); + // NNAdapter Related + pass->Set("use_nnadapter", new bool(argument->use_nnadapter())); + pass->Set("nnadapter_model_cache_dir", + new std::string(argument->nnadapter_model_cache_dir())); + pass->Set( + "nnadapter_device_names", + new std::vector(argument->nnadapter_device_names())); + pass->Set("nnadapter_context_properties", + new std::string(argument->nnadapter_context_properties())); + pass->Set("nnadapter_subgraph_partition_config_buffer", + new std::string( + argument->nnadapter_subgraph_partition_config_buffer())); + pass->Set("nnadapter_subgraph_partition_config_path", + new std::string( + argument->nnadapter_subgraph_partition_config_path())); + pass->Set("nnadapter_model_cache_buffer", + new std::vector>( + argument->nnadapter_model_cache_buffer())); + pass->Set("nnadapter_model_cache_token", + new std::vector( + argument->nnadapter_model_cache_token())); } disable_logs_ = argument->disable_logs(); if (pass_name == "fc_fuse_pass") { diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index b8cac8992f4ee..c04342f837e3f 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -250,12 +250,30 @@ void LiteSubgraphPass::SetUpEngine( std::string autotune_file = Get("autotune_file"); std::string precision = Get("precision"); bool adaptive_seqlen = Get("adaptive_seqlen"); + // NNAdapter Related + bool use_nnadapter = Get("use_nnadapter"); + std::string nnadapter_model_cache_dir = + Get("nnadapter_model_cache_dir"); + auto nnadapter_device_names = + Get>("nnadapter_device_names"); + std::string nnadapter_context_properties = + Get("nnadapter_context_properties"); + std::string nnadapter_subgraph_partition_config_buffer = + Get("nnadapter_subgraph_partition_config_buffer"); + std::string nnadapter_subgraph_partition_config_path = + Get("nnadapter_subgraph_partition_config_path"); + auto nnadapter_model_cache_buffer = + Get>>("nnadapter_model_cache_buffer"); + auto nnadapter_model_cache_token = + Get>("nnadapter_model_cache_token"); lite_api::TargetType target_type; if (use_gpu) { target_type = TARGET(kCUDA); } else if (use_xpu) { target_type = TARGET(kXPU); + } else if (use_nnadapter) { + target_type = TARGET(kNNAdapter); } else { #ifdef PADDLE_WITH_ARM target_type = TARGET(kARM); @@ -292,6 +310,17 @@ void LiteSubgraphPass::SetUpEngine( config.autotune_file = autotune_file; config.precision = precision; config.adaptive_seqlen = adaptive_seqlen; + // NNAdapter Related + config.nnadapter_model_cache_dir = nnadapter_model_cache_dir; + config.nnadapter_device_names = nnadapter_device_names; + config.nnadapter_context_properties = nnadapter_context_properties; + config.nnadapter_subgraph_partition_config_buffer = + nnadapter_subgraph_partition_config_buffer; + config.nnadapter_subgraph_partition_config_path = + 
nnadapter_subgraph_partition_config_path; + config.nnadapter_model_cache_buffer = nnadapter_model_cache_buffer; + config.nnadapter_model_cache_token = nnadapter_model_cache_token; + if (dump_model) { lite::StrToBinaryFile("./model.bin", config.model); lite::StrToBinaryFile("./param.bin", config.param); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index ac540c75511ef..5d056e054f51c 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -207,6 +207,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { // NPU related. CP_MEMBER(use_npu_); CP_MEMBER(npu_device_id_); + CP_MEMBER(nnadapter_config_); // profile related. CP_MEMBER(with_profile_); @@ -554,7 +555,7 @@ void AnalysisConfig::Update() { } if (use_npu_) { -#ifdef PADDLE_WITH_ASCEND_CL +#if defined(PADDLE_WITH_ASCEND_CL) || defined(LITE_SUBGRAPH_WITH_NPU) PADDLE_ENFORCE_EQ(use_gpu_, false, platform::errors::Unavailable( "Currently, NPU and GPU cannot be enabled in the " @@ -833,6 +834,61 @@ std::string AnalysisConfig::Summary() { return os.PrintTable(); } +LiteNNAdapterConfig &LiteNNAdapterConfig::SetDeviceNames( + const std::vector &names) { + nnadapter_device_names = names; + return *this; +} + +LiteNNAdapterConfig &LiteNNAdapterConfig::SetContextProperties( + const std::string &properties) { + nnadapter_context_properties = properties; + return *this; +} + +LiteNNAdapterConfig &LiteNNAdapterConfig::SetModelCacheDir( + const std::string &dir) { + nnadapter_model_cache_dir = dir; + return *this; +} + +LiteNNAdapterConfig &LiteNNAdapterConfig::SetModelCacheBuffers( + const std::string &model_cache_token, + const std::vector &model_cache_buffer) { + PADDLE_ENFORCE_EQ(model_cache_token.empty(), false, + platform::errors::InvalidArgument( + "model_cache_token should not be empty.")); + PADDLE_ENFORCE_EQ(model_cache_buffer.empty(), false, + platform::errors::InvalidArgument( + "model_cache_buffer should not be empty.")); + PADDLE_ENFORCE_EQ(nnadapter_model_cache_buffers.count(model_cache_token), + false, platform::errors::InvalidArgument( + "model_cache_token has already been set.")); + + nnadapter_model_cache_buffers[model_cache_token] = model_cache_buffer; + return *this; +} + +LiteNNAdapterConfig &LiteNNAdapterConfig::SetSubgraphPartitionConfigPath( + const std::string &path) { + nnadapter_subgraph_partition_config_path = path; + return *this; +} + +LiteNNAdapterConfig &LiteNNAdapterConfig::SetSubgraphPartitionConfigBuffer( + const std::string &buffer) { + nnadapter_subgraph_partition_config_buffer = buffer; + return *this; +} +LiteNNAdapterConfig &LiteNNAdapterConfig::Enable() { + use_nnadapter = true; + return *this; +} +LiteNNAdapterConfig &LiteNNAdapterConfig::Disable() { + use_nnadapter = false; + return *this; +} + void AnalysisConfig::CollectShapeRangeInfo( const std::string &shape_range_info_path) { LOG(INFO) << "In CollectShapeInfo mode, we will disable optimizations and " diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index f8491e2abf734..804f035a2e2ca 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -276,6 +276,22 @@ bool AnalysisPredictor::CreateExecutor() { "You tried to use NPU forward propagation, but Paddle was not compiled " "with WITH_ASCEND_CL.")); #endif + } else if (config_.NNAdapter().use_nnadapter) { + if (config_.lite_engine_enabled()) { + place_ = 
paddle::platform::CPUPlace(); +#ifndef LITE_SUBGRAPH_WITH_NNADAPTER + PADDLE_THROW( + platform::errors::Unavailable("You tried to use an NNAdapter lite " + "engine, but Paddle was not compiled " + "with it.")); +#endif // LITE_SUBGRAPH_WITH_NNADAPTER + } else { + PADDLE_THROW( + platform::errors::Unavailable("You tried to use NNadapter forward " + "propagation (inference without lite " + "engine), but Paddle was not compiled " + "with LITE_WITH_NNADAPTER.")); + } } else { place_ = paddle::platform::CPUPlace(); } @@ -601,6 +617,26 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetXpuAutotuneFile(config_.xpu_autotune_file_); argument_.SetXpuPrecision(config_.xpu_precision_); argument_.SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_); + // NNAdapter related + argument_.SetUseNNAdapter(config_.NNAdapter().use_nnadapter); + argument_.SetNNAdapterDeviceNames( + config_.NNAdapter().nnadapter_device_names); + argument_.SetNNAdapterContextProperties( + config_.NNAdapter().nnadapter_context_properties); + argument_.SetNNAdapterModelCacheDir( + config_.NNAdapter().nnadapter_model_cache_dir); + argument_.SetNNAdapterSubgraphPartitionConfigBuffer( + config_.NNAdapter().nnadapter_subgraph_partition_config_buffer); + argument_.SetNNAdapterSubgraphPartitionConfigPath( + config_.NNAdapter().nnadapter_subgraph_partition_config_path); + std::vector buffer_keys; + std::vector> buffer_vals; + for (auto it : config_.NNAdapter().nnadapter_model_cache_buffers) { + buffer_keys.emplace_back(it.first); + buffer_vals.emplace_back(it.second); + } + argument_.SetNNAdapterModelCacheToken(buffer_keys); + argument_.SetNNAdapterModelCacheBuffer(buffer_vals); LOG(INFO) << "Lite subgraph engine is enabled"; } @@ -645,7 +681,32 @@ void AnalysisPredictor::OptimizeInferenceProgram() { VLOG(5) << "to prepare executor"; ARGUMENT_CHECK_FIELD((&argument_), ir_analyzed_program); inference_program_.reset( - new framework::ProgramDesc(argument_.ir_analyzed_program())); + new framework::ProgramDesc(argument_.ir_analyzed_program()), + [](framework::ProgramDesc *prog) { +// Note, please do NOT use any member variables, because member variables may +// have been destructed in multiple threads. +#if PADDLE_WITH_TENSORRT + auto &block = prog->Block(0); + for (auto &op_desc : block.AllOps()) { + if (op_desc->Type() == "tensorrt_engine") { + std::string engine_key = + BOOST_GET_CONST(std::string, op_desc->GetAttr("engine_key")); + int engine_predictor_id = + BOOST_GET_CONST(int, op_desc->GetAttr("predictor_id")); + std::string engine_name = + engine_key + std::to_string(engine_predictor_id); + if (paddle::inference::Singleton< + inference::tensorrt::TRTEngineManager>::Global() + .Has(engine_name)) { + paddle::inference::Singleton< + inference::tensorrt::TRTEngineManager>::Global() + .DeleteKey(engine_name); + } + } + } +#endif + delete prog; + }); // The config and argument take a lot of storage, // when the predictor settings are complete, we release these stores. 
argument_.PartiallyRelease(); diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 513f3669a19ce..86fbde00075f0 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -61,6 +61,26 @@ TEST(AnalysisPredictor, analysis_off) { ASSERT_TRUE(predictor->Run(inputs, &outputs)); } +#ifndef WIN32 +TEST(AnalysisPredictor, lite_nn_adapter_npu) { + AnalysisConfig config; + config.SetModel(FLAGS_dirname); + config.EnableLiteEngine(); + config.NNAdapter() + .Disable() + .Enable() + .SetDeviceNames({"huawei_ascend_npu"}) + .SetContextProperties("HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS=0") + .SetModelCacheDir("cache_dirr") + .SetSubgraphPartitionConfigPath("") + .SetModelCacheBuffers("c1", {'c'}); +#ifndef LITE_SUBGRAPH_WITH_NNADAPTER + EXPECT_THROW(CreatePaddlePredictor(config), + paddle::platform::EnforceNotMet); +#endif +} +#endif + TEST(AnalysisPredictor, analysis_on) { AnalysisConfig config; config.SetModel(FLAGS_dirname); diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index bb1040159470a..c1a0cb4be4429 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -192,14 +192,7 @@ std::unique_ptr NativePaddlePredictor::Clone() { LOG(ERROR) << "fail to call Init"; return nullptr; } - -#ifdef __clang__ - // fix clang compile error return cls; -#else - // fix manylinux compile error. - return std::move(cls); -#endif } bool NativePaddlePredictor::SetFeed(const std::vector &inputs, @@ -390,12 +383,7 @@ std::unique_ptr CreatePaddlePredictor< if (!dynamic_cast(predictor.get())->Init(nullptr)) { return nullptr; } -#ifdef __clang__ - // fix clang compile error return predictor; -#else - return std::move(predictor); -#endif } template <> diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index a64377f80f8aa..d6a0b643c2aee 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -48,6 +48,34 @@ namespace paddle { class AnalysisPredictor; struct MkldnnQuantizerConfig; +struct LiteNNAdapterConfig { + bool use_nnadapter{false}; + std::string nnadapter_model_cache_dir; + std::map> nnadapter_model_cache_buffers; + std::vector nnadapter_device_names; + std::string nnadapter_context_properties; + std::string nnadapter_subgraph_partition_config_path; + std::string nnadapter_subgraph_partition_config_buffer; + + LiteNNAdapterConfig& SetDeviceNames(const std::vector& names); + + LiteNNAdapterConfig& SetContextProperties(const std::string& properties); + + LiteNNAdapterConfig& SetModelCacheDir(const std::string& dir); + + LiteNNAdapterConfig& SetModelCacheBuffers( + const std::string& model_cache_token, + const std::vector& model_cache_buffer); + + LiteNNAdapterConfig& SetSubgraphPartitionConfigPath(const std::string& path); + + LiteNNAdapterConfig& SetSubgraphPartitionConfigBuffer( + const std::string& buffer); + + LiteNNAdapterConfig& Enable(); + LiteNNAdapterConfig& Disable(); +}; + /// /// \brief configuration manager for AnalysisPredictor. /// \since 1.7.0 @@ -692,6 +720,8 @@ struct PD_INFER_DECL AnalysisConfig { /// std::string Summary(); + LiteNNAdapterConfig& NNAdapter() { return nnadapter_config_; } + protected: // Update the config. 
void Update(); @@ -800,6 +830,9 @@ struct PD_INFER_DECL AnalysisConfig { std::string xpu_precision_; bool xpu_adaptive_seqlen_; + // NNAdapter related + LiteNNAdapterConfig nnadapter_config_; + // mkldnn related. int mkldnn_cache_capacity_{10}; bool use_mkldnn_quantizer_{false}; diff --git a/paddle/fluid/inference/lite/CMakeLists.txt b/paddle/fluid/inference/lite/CMakeLists.txt index 2482a6917530b..6d981d007e73a 100644 --- a/paddle/fluid/inference/lite/CMakeLists.txt +++ b/paddle/fluid/inference/lite/CMakeLists.txt @@ -2,8 +2,8 @@ if(XPU_SDK_ROOT) set(XPU_DEPS xpuapi xpurt) endif() -cc_library(lite_op_teller SRCS op_teller.cc DEPS lite_full_static framework_proto device_context boost xxhash) -cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto ${XPU_DEPS}) -cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost device_context ${XPU_DEPS}) +cc_library(lite_op_teller SRCS op_teller.cc DEPS ${LITE_DEPS} framework_proto device_context boost xxhash) +cc_library(lite_engine SRCS engine.cc DEPS ${LITE_DEPS} framework_proto ${XPU_DEPS}) +cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy ${LITE_DEPS} framework_proto boost device_context ${XPU_DEPS}) cc_test(test_lite_engine SRCS test_engine_lite.cc DEPS lite_engine protobuf framework_proto glog gtest analysis) cc_test(test_lite_tensor_utils SRCS test_tensor_utils.cc DEPS lite_engine lite_tensor_utils) diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc index 908e1ab990bb7..47b9d681b4754 100644 --- a/paddle/fluid/inference/lite/engine.cc +++ b/paddle/fluid/inference/lite/engine.cc @@ -69,6 +69,25 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create( cfg.adaptive_seqlen); #endif +#ifdef LITE_SUBGRAPH_WITH_NPU + lite_cxx_config.set_nnadapter_device_names(cfg.nnadapter_device_names); + lite_cxx_config.set_nnadapter_context_properties( + cfg.nnadapter_context_properties); + lite_cxx_config.set_nnadapter_model_cache_dir(cfg.nnadapter_model_cache_dir); + if (!cfg.nnadapter_subgraph_partition_config_path.empty()) { + lite_cxx_config.set_nnadapter_subgraph_partition_config_path( + cfg.nnadapter_subgraph_partition_config_path); + } + if (!cfg.nnadapter_subgraph_partition_config_buffer.empty()) { + lite_cxx_config.set_nnadapter_subgraph_partition_config_buffer( + cfg.nnadapter_subgraph_partition_config_buffer); + } + for (size_t i = 0; i < cfg.nnadapter_model_cache_token.size(); ++i) { + lite_cxx_config.set_nnadapter_model_cache_buffers( + cfg.nnadapter_model_cache_token[i], + cfg.nnadapter_model_cache_buffer[i]); + } +#endif // create predictor std::shared_ptr p = paddle::lite_api::CreatePaddlePredictor(lite_cxx_config); diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h index a64ef1eda828b..48072656cb996 100644 --- a/paddle/fluid/inference/lite/engine.h +++ b/paddle/fluid/inference/lite/engine.h @@ -53,6 +53,15 @@ struct EngineConfig { // for cuda bool use_multi_stream{false}; + + // for nnadapter or npu. 
+ std::string nnadapter_model_cache_dir; + std::vector nnadapter_device_names; + std::string nnadapter_context_properties; + std::string nnadapter_subgraph_partition_config_buffer; + std::string nnadapter_subgraph_partition_config_path; + std::vector nnadapter_model_cache_token; + std::vector> nnadapter_model_cache_buffer; }; class EngineManager { diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 6bbda6bb29aad..e32d87087b66f 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -86,7 +86,20 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, nvinfer1::DimsHW nv_ksize(filter_h, filter_w); nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]); nvinfer1::DimsHW nv_strides(strides[0], strides[1]); - nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); + nvinfer1::DimsHW nv_paddings; + nvinfer1::Dims nv_pre_paddings; + nvinfer1::Dims nv_post_paddings; + if (paddings.size() == 2) { + nv_paddings.d[0] = paddings[0]; + nv_paddings.d[1] = paddings[1]; + } else { + nv_pre_paddings.nbDims = 2; + nv_post_paddings.nbDims = 2; + nv_pre_paddings.d[0] = paddings[0]; + nv_pre_paddings.d[1] = paddings[2]; + nv_post_paddings.d[0] = paddings[1]; + nv_post_paddings.d[1] = paddings[3]; + } TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, static_cast(weight_data), @@ -116,7 +129,13 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, layer, platform::errors::Fatal("TensorRT create conv2d/conv2d_transpose" " layer failed.")); layer->setStride(nv_strides); - layer->setPadding(nv_paddings); + if (paddings.size() == 2) { + layer->setPadding(nv_paddings); + } else { + layer->setPrePadding(nv_pre_paddings); + layer->setPostPadding(nv_post_paddings); + } + layer->setNbGroups(groups); if (padding_algorithm == "SAME") { layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); diff --git a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc index 7ef79e547d09a..fb1fb4a6a7b39 100644 --- a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc @@ -64,9 +64,21 @@ class HardSwishOpConverter : public OpConverter { nvinfer1::ElementWiseOperation::kPROD); layer = eltwise_layer; } else { - plugin::HardSwishPlugin* plugin = - new plugin::HardSwishPlugin(threshold, scale, offset); - layer = engine_->AddPlugin(&input, input_num, plugin); + if (engine_->with_dynamic_shape()) { +#if IS_TRT_VERSION_GE(6000) + plugin::HardSwishPluginDynamic* plugin = + new plugin::HardSwishPluginDynamic(threshold, scale, offset); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); +#else + PADDLE_THROW(platform::errors::Fatal( + "You are running the TRT Dynamic Shape mode, need to confirm that " + "your TRT version is no less than 6.0")); +#endif + } else { + plugin::HardSwishPlugin* plugin = + new plugin::HardSwishPlugin(threshold, scale, offset); + layer = engine_->AddPlugin(&input, input_num, plugin); + } } auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "hard_swish", {output_name}, test_mode); diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index a073acc96c0d4..3935342e70296 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ 
b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -23,7 +23,6 @@ class MultiheadMatMulOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { -#if IS_TRT_VERSION_GE(6000) VLOG(3) << "convert a fluid multihead_mamul op to a corresponding tensorrt " "network structure"; framework::OpDesc op_desc(op, nullptr); @@ -46,10 +45,6 @@ class MultiheadMatMulOpConverter : public OpConverter { float in_scale = 0.; if (enable_int8) { - PADDLE_ENFORCE_EQ( - op_desc.HasAttr("Input_scale"), true, - platform::errors::InvalidArgument( - "must have input scale in multihead layers in int8 mode")); in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127; auto weight_scale = BOOST_GET_CONST(std::vector, op_desc.GetAttr("weight_scale")); @@ -181,10 +176,7 @@ class MultiheadMatMulOpConverter : public OpConverter { {"hidden_size", &hidden_out, nvinfer1::PluginFieldType::kINT32, 1}, {"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1}, {"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1}, - { "var_seqlen", - &var_seqlen, - nvinfer1::PluginFieldType::kINT32, - 1 }}; + {"var_seqlen", &var_seqlen, nvinfer1::PluginFieldType::kINT32, 1}}; if (qkv2context_plugin_int8) { fields.push_back( {"dq_probs", &dp_probs, nvinfer1::PluginFieldType::kFLOAT32, 1}); @@ -296,11 +288,6 @@ class MultiheadMatMulOpConverter : public OpConverter { } RreplenishLayerAndOutput(layer, "multihead_matmul", {output_name}, test_mode); -#else - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, need to confirm that " - "your TRT version is no less than 6.0")); -#endif } }; diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 90d6392fd6404..1898f28c73ad0 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -87,6 +87,10 @@ class Pool2dOpConverter : public OpConverter { bool adaptive = false; if (op_desc.HasAttr("adaptive")) adaptive = BOOST_GET_CONST(bool, op_desc.GetAttr("adaptive")); + std::string padding_algorithm = "EXPLICIT"; + if (op_desc.HasAttr("padding_algorithm")) + padding_algorithm = + BOOST_GET_CONST(std::string, op_desc.GetAttr("padding_algorithm")); nvinfer1::PoolingType nv_pool_type = nvinfer1::PoolingType::kMAX; nvinfer1::ReduceOperation reduce_operation = @@ -124,6 +128,9 @@ class Pool2dOpConverter : public OpConverter { pool_layer->setStride(nv_strides); pool_layer->setPadding(nv_paddings); pool_layer->setAverageCountExcludesPadding(exclusive); + if (padding_algorithm == "SAME") { + pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } layer = pool_layer; } else if (global_pooling) { auto *reduce_layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *input1, @@ -159,6 +166,9 @@ class Pool2dOpConverter : public OpConverter { auto output_name = op_desc.Output("Out")[0]; pool_layer->setStride(nv_strides); pool_layer->setPadding(nv_paddings); + if (padding_algorithm == "SAME") { + pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } pool_layer->setAverageCountExcludesPadding(exclusive); pool_layer->setName(("pool2d (Output: " + output_name + ")").c_str()); pool_layer->getOutput(0)->setName(output_name.c_str()); @@ -198,6 +208,9 @@ class Pool2dOpConverter : public OpConverter { "trt pool layer in converter could not be created.")); pool_layer->setStride(nv_strides); 
pool_layer->setPadding(nv_paddings); + if (padding_algorithm == "SAME") { + pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } pool_layer->setAverageCountExcludesPadding(exclusive); layer = pool_layer; } else { diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 29324f290064c..e22c2488d3b8b 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -631,6 +631,14 @@ class TRTEngineManager { } } + void DeleteKey(const std::string& key) { + auto iter = engines_.find(key); + if (iter != engines_.end()) { + iter->second.reset(nullptr); + engines_.erase(iter); + } + } + private: std::unordered_map> engines_; }; diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 1b0c6c0a71d17..5bfd2f1277795 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -149,16 +149,15 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; for (auto& teller : tellers_) { - if (op_type == "depthwise_conv2d") { - std::vector paddings = - BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); - - if (paddings.size() > 2) return false; - } - if (op_type == "relu" || op_type == "relu6" || op_type == "tanh" || op_type == "sigmoid") { auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } auto x_var_name = desc.Input("X")[0]; auto* x_var_desc = block->FindVar(x_var_name); const auto x_shape = x_var_desc->GetShape(); @@ -173,6 +172,22 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, std::vector paddings = BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); if (paddings.size() > 2) return false; + if (desc.HasAttr("exclusive")) { + if (BOOST_GET_CONST(bool, desc.GetAttr("exclusive"))) { + std::vector ksize = + BOOST_GET_CONST(std::vector, desc.GetAttr("ksize")); + for (size_t i = 0; i < ksize.size(); i++) { + if (ksize[i] <= paddings[i]) { + VLOG(3) << "the padding size should be less than the filter size " + "for exclusive-counting pooling."; + return false; + } + } + } + } + if (desc.HasAttr("ceil_mode")) { + if (BOOST_GET_CONST(bool, desc.GetAttr("ceil_mode"))) return false; + } if (desc.Input("X").size() != 1) { VLOG(3) << "TRT Pool2d expect 1 input, but got " << desc.Input("X").size(); @@ -202,9 +217,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, std::vector paddings = BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); - // conv2d and conv2d_transpose need padding check - if (paddings.size() > 2 && op_type != "conv2d_fusion") return false; - if (desc.Input("Input").size() != 1) { VLOG(3) << "TRT Conv2d expect 1 input, but got " << desc.Input("Input").size() << " input."; @@ -217,6 +229,14 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } + if (desc.HasAttr("padding_algorithm")) { + auto padding_algorithm = + BOOST_GET_CONST(std::string, desc.GetAttr("padding_algorithm")); + if (padding_algorithm == "SAME" || padding_algorithm == "VALID") { + return false; + } + } + if (desc.HasAttr("enable_int8")) { if (op_type == "conv2d" || op_type == "conv2d_fusion") { if (!desc.HasAttr("Input_scale")) { @@ -274,6 +294,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool 
use_no_calib_int8, if (op_type == "matmul") { auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } for (auto& param_name : desc.Inputs()) { for (auto& var_name : param_name.second) { auto* var_desc = block->FindVar(var_name); @@ -324,6 +350,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (axis[0] == 0 && axis.size() == 2) return false; auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } auto x_var_name = desc.Input("X")[0]; auto* x_var_desc = block->FindVar(x_var_name); const auto x_shape = x_var_desc->GetShape(); @@ -372,6 +404,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } else { auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } auto* x_var_desc = block->FindVar(desc.Input("X")[0]); const auto x_shape = x_var_desc->GetShape(); if (x_shape.size() == 1) { @@ -385,6 +423,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (!with_dynamic_shape) return false; auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } auto x_var_name = desc.Input("X")[0]; auto index_var_name = desc.Input("Index")[0]; auto* x_var_desc = block->FindVar(x_var_name); @@ -412,6 +456,10 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "anchor_generator") { + if (!with_dynamic_shape) return false; + } + if (op_type == "yolo_box") { if (with_dynamic_shape) return false; bool has_attrs = @@ -428,6 +476,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (data_layout != framework::DataLayout::kNCHW) return false; auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } auto x_var_name = desc.Input("X")[0]; auto* x_var_desc = block->FindVar(x_var_name); const auto x_shape = x_var_desc->GetShape(); @@ -439,6 +493,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (op_type == "multiclass_nms") { if (with_dynamic_shape) return false; auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } for (auto& param_name : desc.Inputs()) { for (auto& var_name : param_name.second) { auto* var_desc = block->FindVar(var_name); @@ -598,6 +658,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. 
" + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } auto x_var_name = desc.Input("X")[0]; auto* x_var_desc = block->FindVar(x_var_name); const auto x_shape = x_var_desc->GetShape(); @@ -657,6 +723,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } auto x_var_name = desc.Input("X")[0]; auto* x_var_desc = block->FindVar(x_var_name); const auto x_shape = x_var_desc->GetShape(); @@ -724,6 +796,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } auto* x_var_desc = block->FindVar(desc.Input("X")[0]); auto* y_var_desc = block->FindVar(desc.Input("Y")[0]); const auto x_shape = x_var_desc->GetShape(); @@ -775,6 +853,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } auto x_var_name = desc.Input("X")[0]; auto* x_var_desc = block->FindVar(x_var_name); const auto x_shape = x_var_desc->GetShape(); @@ -856,6 +940,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } std::vector shape; auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } for (auto& param_name : desc.Inputs()) { for (auto& var_name : param_name.second) { auto* var_desc = block->FindVar(var_name); @@ -881,6 +971,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (op_type == "scale") { auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } auto x_var_name = desc.Input("X")[0]; auto* x_var_desc = block->FindVar(x_var_name); const auto x_shape = x_var_desc->GetShape(); @@ -892,6 +988,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (op_type == "swish") { auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } auto x_var_name = desc.Input("X")[0]; auto* x_var_desc = block->FindVar(x_var_name); const auto x_shape = x_var_desc->GetShape(); @@ -916,6 +1018,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. 
" + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } auto* var_desc = block->FindVar(desc.Input("Alpha")[0]); if (!var_desc) { VLOG(3) << "Variable Alpha of prelu TRT converter not found."; @@ -997,6 +1105,42 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, VLOG(3) << "the multihead_matmul does not support static shape yet"; return false; } + + if (desc.HasAttr("enable_int8") && !desc.HasAttr("Input_scale")) { + VLOG(3) << "Multihead layers must have input scale in int8 mode."; + return false; + } + + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + auto* input_desc = block->FindVar(desc.Input("Input").front()); + const auto input_shape = input_desc->GetShape(); + const auto head_number = + BOOST_GET_CONST(int, desc.GetAttr("head_number")); + + auto* biasqk_desc = block->FindVar(desc.Input("BiasQK").front()); + const auto biasqk_shape = biasqk_desc->GetShape(); + // The BiasQK's shape requires to be + // [batch, 1, 1, length] or [batch, head, length, length]. + bool has_same_shape = head_number == biasqk_shape[1] && + input_shape[1] == biasqk_shape[2] && + input_shape[1] == biasqk_shape[3]; + bool is_broadcastable = biasqk_shape[1] == 1 && biasqk_shape[2] == 1 && + input_shape[1] == biasqk_shape[3]; + if (!(has_same_shape || is_broadcastable)) { + VLOG(3) << "The BiasQK's shape is invalid, expect [" << input_shape[0] + << ", 1, 1, " << input_shape[1] << "] or [" << input_shape[0] + << ", " << head_number << ", " << input_shape[1] << ", " + << input_shape[1] << "] but [" << biasqk_shape[0] << ", " + << biasqk_shape[1] << ", " << biasqk_shape[2] << ", " + << biasqk_shape[3] << "]."; + return false; + } } if (op_type == "fc") { @@ -1051,6 +1195,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. 
" + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } auto x_var_name = desc.Input("X")[0]; auto* x_var_desc = block->FindVar(x_var_name); const auto x_shape = x_var_desc->GetShape(); @@ -1169,6 +1319,24 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "hard_sigmoid") { + if (!with_dynamic_shape) { + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block is null."; + return false; + } + auto x_var_name = desc.Input("X")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + if (x_shape.size() <= 2) { + VLOG(3) << "hard_sigmoid op does not support input's dim less than 3 " + "in tensorrt."; + return false; + } + } + } + if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } diff --git a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu index 28060bd2facbe..9872b1ff8d957 100644 --- a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu @@ -22,10 +22,10 @@ namespace tensorrt { namespace plugin { nvinfer1::Dims HardSwishPlugin::getOutputDimensions( - int index, const nvinfer1::Dims* in_dims, int nb_inputs) TRT_NOEXCEPT { + int index, const nvinfer1::Dims *in_dims, int nb_inputs) TRT_NOEXCEPT { assert(nb_inputs == 1); assert(index < this->getNbOutputs()); - nvinfer1::Dims const& input_dims = in_dims[0]; + nvinfer1::Dims const &input_dims = in_dims[0]; nvinfer1::Dims output_dims = input_dims; return output_dims; } @@ -42,7 +42,7 @@ __device__ T kMin(T a, T b) { template __global__ void hard_swish_kernel(float threshold, float scale, float offset, - int n, const T* input, T* output) { + int n, const T *input, T *output) { const int idx = blockIdx.x * TPB + threadIdx.x; if (idx < n) { const T in = input[idx]; @@ -50,14 +50,14 @@ __global__ void hard_swish_kernel(float threshold, float scale, float offset, } } -int HardSwishPlugin::enqueue(int batch_size, const void* const* inputs, +int HardSwishPlugin::enqueue(int batch_size, const void *const *inputs, #if IS_TRT_VERSION_LT(8000) - void** outputs, void*, cudaStream_t stream) { + void **outputs, void *, cudaStream_t stream) { #else - void* const* outputs, void*, + void *const *outputs, void *, cudaStream_t stream) TRT_NOEXCEPT { #endif - const auto& input_dims = this->getInputDims(0); + const auto &input_dims = this->getInputDims(0); int num = batch_size; for (int i = 0; i < input_dims.nbDims; i++) { num *= input_dims.d[i]; @@ -69,14 +69,79 @@ int HardSwishPlugin::enqueue(int batch_size, const void* const* inputs, const int block_size = 256; const int grid_size = (num + block_size - 1) / block_size; - const float* input = static_cast(inputs[0]); - float* output = static_cast(outputs[0]); + const float *input = static_cast(inputs[0]); + float *output = static_cast(outputs[0]); hard_swish_kernel<<>>( threshold, scale, offset, num, input, output); return cudaGetLastError() != cudaSuccess; } +#if IS_TRT_VERSION_GE(6000) + +nvinfer1::DimsExprs HardSwishPluginDynamic::getOutputDimensions( + int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, + nvinfer1::IExprBuilder &expr_builder) TRT_NOEXCEPT { + return inputs[0]; +} + +int HardSwishPluginDynamic::enqueue( + const nvinfer1::PluginTensorDesc *input_desc, + const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, + void *const *outputs, void 
*workspace, cudaStream_t stream) TRT_NOEXCEPT { + auto input_dims = input_desc[0].dims; + int num = 1; + for (int i = 0; i < input_dims.nbDims; i++) { + num *= input_dims.d[i]; + } + float threshold = threshold_; + float scale = scale_; + float offset = offset_; + const int block_size = 256; + const int grid_size = (num + block_size - 1) / block_size; + const float *input = static_cast(inputs[0]); + float *output = static_cast(outputs[0]); + hard_swish_kernel<<>>( + threshold, scale, offset, num, input, output); + + return cudaGetLastError() != cudaSuccess; +} + +nvinfer1::DataType HardSwishPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType *input_types, + int nb_inputs) const TRT_NOEXCEPT { + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The Elementwise Plugin only has one input, so the " + "index value should be 0, but get %d.", + index)); + return input_types[0]; +} + +bool HardSwishPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs, + int nb_outputs) TRT_NOEXCEPT { + PADDLE_ENFORCE_NOT_NULL( + in_out, platform::errors::InvalidArgument( + "The input of swish plugin shoule not be nullptr.")); + + PADDLE_ENFORCE_LT( + pos, nb_inputs + nb_outputs, + platform::errors::InvalidArgument("The pos(%d) should be less than the " + "num(%d) of the input and the output.", + pos, nb_inputs + nb_outputs)); + (in_out && pos < (nb_inputs + nb_outputs)); + + const nvinfer1::PluginTensorDesc &in = in_out[pos]; + if (pos == 0) { + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } + const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1]; + // output + return in.type == prev.type && in.format == prev.format; +} +#endif } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h index 5dfa00ef1c204..c0ee608c39dab 100644 --- a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h @@ -94,6 +94,113 @@ class HardSwishPluginCreator : public TensorRTPluginCreator { }; REGISTER_TRT_PLUGIN_V2(HardSwishPluginCreator); +#if IS_TRT_VERSION_GE(6000) +class HardSwishPluginDynamic : public DynamicPluginTensorRT { + public: + HardSwishPluginDynamic(const float threshold, const float scale, + const float offset) + : threshold_(threshold), scale_(scale), offset_(offset) {} + + // It was used for tensorrt deserialization. + // It should not be called by users. 
+ HardSwishPluginDynamic(void const* serialData, size_t serialLength) { + DeserializeValue(&serialData, &serialLength, &threshold_); + DeserializeValue(&serialData, &serialLength, &scale_); + DeserializeValue(&serialData, &serialLength, &offset_); + } + ~HardSwishPluginDynamic() {} + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { + return new HardSwishPluginDynamic(threshold_, scale_, offset_); + } + const char* getPluginType() const TRT_NOEXCEPT override { + return "hard_swish_plugin_dynamic"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override { return 0; } + nvinfer1::DimsExprs getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + + size_t getSerializationSize() const TRT_NOEXCEPT override { + return SerializedSize(threshold_) + SerializedSize(scale_) + + SerializedSize(offset_); + } + + // TRT will call this func to serialize the configuration of TRT + // It should not be called by users. + void serialize(void* buffer) const TRT_NOEXCEPT override { + SerializeValue(&buffer, threshold_); + SerializeValue(&buffer, scale_); + SerializeValue(&buffer, offset_); + } + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override {} + void destroy() TRT_NOEXCEPT override { delete this; } + + protected: + float threshold_; + float scale_; + float offset_; +}; + +class HardSwishPluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + HardSwishPluginDynamicCreator() {} + const char* getPluginName() const TRT_NOEXCEPT override { + return "hardswish_plugin_dynamic"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { + auto plugin = new HardSwishPluginDynamic(serial_data, serial_length); + return plugin; + } + + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const TRT_NOEXCEPT override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; +REGISTER_TRT_PLUGIN_V2(HardSwishPluginDynamicCreator); + +#endif + } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index 
0d978939c4bf3..6bae3606afe0e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -233,6 +233,24 @@ __global__ void apply_scale(T *data, T scale, int n) { #endif } +inline int round_up(int seq_len, int multiple = 32) { + PADDLE_ENFORCE_GT( + multiple, 0, + platform::errors::InvalidArgument( + "multiple should be a positive number,but it's (%d)", multiple)); + return ((seq_len + multiple - 1) / multiple) * multiple; +} + +template +__global__ void broadcast(const T *src, T *dst, const int seq_len, + const int head_num) { + int batch_id = blockIdx.x / (head_num * seq_len); + int dst_offset = blockIdx.x * seq_len; + if (threadIdx.x < seq_len) { + dst[threadIdx.x + dst_offset] = src[threadIdx.x + batch_id * seq_len]; + } +} + int QkvToContextPluginDynamic::enqueue( const nvinfer1::PluginTensorDesc *input_desc, const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, @@ -258,7 +276,21 @@ int QkvToContextPluginDynamic::enqueue( auto *tptr = multihead_temp_data + scratch_size; const float *input0_data = static_cast(inputs[0]); - const float *input1_data = static_cast(inputs[1]); + // fit to [batch, head_num, length, length] + [batch, 1, 1, length] + framework::Tensor temp_qk_bias_tensor; + float *qk_bias = const_cast(static_cast(inputs[1])); + if (ProductDim(input_desc[1].dims) == (batch * seq_len)) { + temp_qk_bias_tensor.Resize({batch, head_number_, seq_len, seq_len}); + auto *temp_qk_bias = temp_qk_bias_tensor.mutable_data( + platform::CUDAPlace(device_id)); + int grid = batch * head_number_ * seq_len; + int block = round_up(seq_len); + broadcast<<>>( + static_cast(inputs[1]), temp_qk_bias, seq_len, + head_number_); + qk_bias = temp_qk_bias; + } + const float *input1_data = static_cast(qk_bias); // BxSx3xNxH => tptr: 3xBxNxSxH. TransposeQKV(batch, seq_len, head_size_, head_number_, input0_data, tptr, stream); @@ -290,7 +322,22 @@ int QkvToContextPluginDynamic::enqueue( half *tptr = qkptr + scratch_size; const half *input0_data = static_cast(inputs[0]); - const half *input1_data = static_cast(inputs[1]); + // fit to [batch, head_num, length, length] + [batch, 1, 1, length] + framework::Tensor temp_qk_bias_tensor; + half *qk_bias = const_cast(static_cast(inputs[1])); + if (ProductDim(input_desc[1].dims) == (batch * seq_len)) { + temp_qk_bias_tensor.Resize({batch, head_number_, seq_len, seq_len}); + auto *temp_qk_bias = + reinterpret_cast(temp_qk_bias_tensor.mutable_data( + platform::CUDAPlace(device_id))); + int grid = batch * head_number_ * seq_len; + int block = round_up(seq_len); + broadcast<<>>( + static_cast(inputs[1]), temp_qk_bias, seq_len, + head_number_); + qk_bias = temp_qk_bias; + } + const half *input1_data = static_cast(qk_bias); // BxSx3xNxH => tptr: 3xBxNxSxH. 
TransposeQKV(batch, seq_len, head_size_, head_number_, input0_data, tptr, stream); diff --git a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt index 50d56b3e59ae7..9d590509a1eb6 100644 --- a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt +++ b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt @@ -93,20 +93,42 @@ if (USE_TENSORRT AND WITH_GPU) file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS) string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_MINOR +([0-9]+)" TENSORRT_MINOR_VERSION + "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_PATCH +([0-9]+)" TENSORRT_PATCH_VERSION + "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_BUILD +([0-9]+)" TENSORRT_BUILD_VERSION + "${TENSORRT_VERSION_FILE_CONTENTS}") if("${TENSORRT_MAJOR_VERSION}" STREQUAL "") file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h TENSORRT_VERSION_FILE_CONTENTS) string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_MINOR +([0-9]+)" TENSORRT_MINOR_VERSION + "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_PATCH +([0-9]+)" TENSORRT_PATCH_VERSION + "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_BUILD +([0-9]+)" TENSORRT_BUILD_VERSION + "${TENSORRT_VERSION_FILE_CONTENTS}") endif() if("${TENSORRT_MAJOR_VERSION}" STREQUAL "") message(SEND_ERROR "Failed to detect TensorRT version.") endif() string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1" TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}") + string(REGEX REPLACE "define NV_TENSORRT_MINOR +([0-9]+)" "\\1" + TENSORRT_MINOR_VERSION "${TENSORRT_MINOR_VERSION}") + string(REGEX REPLACE "define NV_TENSORRT_PATCH +([0-9]+)" "\\1" + TENSORRT_PATCH_VERSION "${TENSORRT_PATCH_VERSION}") + string(REGEX REPLACE "define NV_TENSORRT_BUILD +([0-9]+)" "\\1" + TENSORRT_BUILD_VERSION "${TENSORRT_BUILD_VERSION}") message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " - "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. 
") + "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}.${TENSORRT_MINOR_VERSION}.${TENSORRT_PATCH_VERSION}.${TENSORRT_BUILD_VERSION} ") include_directories("${TENSORRT_INCLUDE_DIR}") link_directories("${TENSORRT_LIB_DIR}") + add_compile_definitions(NV_TENSORRT_MAJOR=${TENSORRT_MAJOR_VERSION}) + add_compile_definitions(NV_TENSORRT_MINOR=${TENSORRT_MINOR_VERSION}) + add_compile_definitions(NV_TENSORRT_PATCH=${TENSORRT_PATCH_VERSION}) + add_compile_definitions(NV_TENSORRT_BUILD=${TENSORRT_BUILD_VERSION}) endif() if(WITH_MKL) diff --git a/paddle/fluid/inference/tests/infer_ut/run.sh b/paddle/fluid/inference/tests/infer_ut/run.sh index 1547071e75d49..dd4b64f28d739 100755 --- a/paddle/fluid/inference/tests/infer_ut/run.sh +++ b/paddle/fluid/inference/tests/infer_ut/run.sh @@ -115,6 +115,20 @@ for model_name in $unknown_download_list; do download $url_prefix $model_name done +# ernie int8 quant with matmul +unknown_nlp_download_list='quant_post_model_xnli_predict_matmul' +for model_name in $unknown_nlp_download_list; do + url_prefix="https://paddle-qa.bj.bcebos.com/inference_model/unknown/nlp" + download $url_prefix $model_name +done + +# mobilnetv1 with prune op attribute +dev_class_download_list='MobileNetV1' +for model_name in $dev_class_download_list; do + url_prefix="https://paddle-qa.bj.bcebos.com/inference_model/2021-09-16/class" + download $url_prefix $model_name +done + function compile_test() { mkdir -p ${build_dir} cd ${build_dir} @@ -255,6 +269,31 @@ if [ $? -ne 0 ]; then EXIT_CODE=8 fi +printf "${YELLOW} start test_ernie_xnli_int8 ${NC} \n"; +compile_test "test_ernie_xnli_int8" +ernie_qat_model="quant_post_model_xnli_predict_matmul" +${exe_dir}/test_ernie_xnli_int8 \ + --modeldir=$DATA_DIR/$ernie_qat_model/$ernie_qat_model \ + --datadir=$DATA_DIR/$ernie_qat_model/$ernie_qat_model/xnli_var_len \ + --truth_data=$DATA_DIR/$ernie_qat_model/$ernie_qat_model/truth_data \ + --gtest_filter=${test_suite_list} \ + --gtest_output=xml:${log_dir}/test_ernie_xnli_int8.xml +if [ $? -ne 0 ]; then + echo "${RED} test_ernie_xnli_int8 runs failed ${NC}" >> ${exe_dir}/test_summary.txt + EXIT_CODE=8 +fi + +printf "${YELLOW} start test_mobilnetv1 ${NC} \n"; +compile_test "test_mobilnetv1" +${exe_dir}/test_mobilnetv1 \ + --modeldir=$DATA_DIR/MobileNetV1/MobileNetV1 \ + --gtest_filter=${test_suite_list} \ + --gtest_output=xml:${log_dir}/test_mobilnetv1.xml +if [ $? -ne 0 ]; then + echo "${RED} test_mobilnetv1 runs failed ${NC}" >> ${exe_dir}/test_summary.txt + EXIT_CODE=8 +fi + set +x test_suites=$(echo ${test_suite_list} | sed 's/:/ /g') diff --git a/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc b/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc new file mode 100644 index 0000000000000..9e83551126552 --- /dev/null +++ b/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc @@ -0,0 +1,194 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "test_helper.h" // NOLINT +#include "test_suite.h" // NOLINT + +DEFINE_string(modeldir, "", "Directory of the inference model."); +DEFINE_string(datadir, "", "dataset."); +DEFINE_string(truth_data, "", "Directory of the inference data truth result"); + +namespace paddle_infer { + +std::shared_ptr InitPredictor() { + Config config; + config.SetModel(FLAGS_modeldir + "/__model__", + FLAGS_modeldir + "/__params__"); + config.EnableUseGpu(1000, 0); + // Open the memory optim. + config.EnableMemoryOptim(); + + int max_batch = 32; + int max_single_seq_len = 128; + int opt_single_seq_len = 64; + int min_batch_seq_len = 1; + int max_batch_seq_len = 512; + int opt_batch_seq_len = 256; + + std::string input_name0 = "eval_placeholder_0"; + std::string input_name1 = "eval_placeholder_1"; + std::string input_name2 = "eval_placeholder_2"; + std::string input_name3 = "eval_placeholder_3"; + + std::vector min_shape = {min_batch_seq_len}; + std::vector max_shape = {max_batch_seq_len}; + std::vector opt_shape = {opt_batch_seq_len}; + // Set the input's min, max, opt shape + std::map> min_input_shape = { + {input_name0, min_shape}, + {input_name1, min_shape}, + {input_name2, {1}}, + {input_name3, {1, min_batch_seq_len, 1}}}; + std::map> max_input_shape = { + {input_name0, max_shape}, + {input_name1, max_shape}, + {input_name2, {max_batch + 1}}, + {input_name3, {1, max_single_seq_len, 1}}}; + std::map> opt_input_shape = { + {input_name0, opt_shape}, + {input_name1, opt_shape}, + {input_name2, {max_batch + 1}}, + {input_name3, {1, opt_single_seq_len, 1}}}; + + // only kHalf supported + config.EnableTensorRtEngine(1 << 30, 1, 5, Config::Precision::kInt8, false, + false); + // erinie varlen must be used with dynamic shape + config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, + opt_input_shape); + // erinie varlen must be used with oss + config.EnableTensorRtOSS(); + + return CreatePredictor(config); +} + +// Parse tensor from string +template +std::vector ParseTensor(const std::string &field) { + std::string mat_str = field; + + std::vector mat; + paddle::test::Split(mat_str, ' ', &mat); + + return mat; +} + +void run(Predictor *predictor, std::vector *out_data) { + clock_t start, end; + start = clock(); + CHECK(predictor->Run()); + end = clock(); + + auto output_names = predictor->GetOutputNames(); + auto output_t = predictor->GetOutputHandle(output_names[0]); + std::vector output_shape = output_t->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + out_data->resize(out_num); + output_t->CopyToCpu(out_data->data()); + return; +} + +auto PrepareOutput(std::string input_file) -> std::deque { + std::ifstream fin(input_file); + std::string line; + std::vector buffer; + while (std::getline(fin, line)) { + buffer.emplace_back(line); + } + std::deque resDeque(buffer.size()); + std::transform(buffer.begin(), buffer.end(), resDeque.begin(), + [](const std::string &val) { return std::stof(val); }); + + return resDeque; +} // PrepareOutput + +TEST(tensorrt_tester_ernie_xnli, oss_varlen_truth_data_int8) { + auto resDeque = PrepareOutput(FLAGS_truth_data); + auto predictor = InitPredictor(); + + ASSERT_FALSE(FLAGS_datadir.empty()); + std::ifstream fin(FLAGS_datadir); + std::string line; + + int lineno = 0; + const int max_seq_len = 128; + const int run_batch = 1; + int correct_num = 0; + while (std::getline(fin, line)) { + std::vector fields; + paddle::test::Split(line, ';', &fields); + + auto src_ids = ParseTensor(fields[0]); + auto sent_ids 
= ParseTensor(fields[1]); + auto pos_ids = ParseTensor(fields[2]); + + int run_seq_len = src_ids.size(); + int32_t i3[2] = {0, run_seq_len}; + int32_t i4[max_seq_len] = {0}; + + auto input_names = predictor->GetInputNames(); + + // first input + auto input_t1 = predictor->GetInputHandle(input_names[0]); + input_t1->Reshape({run_seq_len}); + input_t1->CopyFromCpu(src_ids.data()); + + // second input + auto input_t2 = predictor->GetInputHandle(input_names[1]); + input_t2->Reshape({run_seq_len}); + input_t2->CopyFromCpu(sent_ids.data()); + + // third input + auto input_t3 = predictor->GetInputHandle(input_names[2]); + input_t3->Reshape({run_batch + 1}); + input_t3->CopyFromCpu(i3); + + // fourth input + auto input_t4 = predictor->GetInputHandle(input_names[3]); + input_t4->Reshape({1, max_seq_len, 1}); + input_t4->CopyFromCpu(i4); + + std::vector out_data; + run(predictor.get(), &out_data); + + lineno++; + int maxPosition = + max_element(out_data.begin(), out_data.end()) - out_data.begin(); + + if (maxPosition == resDeque[0]) { + correct_num += 1; + } + resDeque.pop_front(); + + VLOG(2) << "predict result: " << maxPosition; + for (auto r : out_data) { + VLOG(2) << r; + } + } + ASSERT_GT(correct_num, + 3855); // total input 5010, int8 res should greater than 3855 + LOG(INFO) << "=== finish oss test ==="; +} + +} // namespace paddle_infer + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + ::google::ParseCommandLineFlags(&argc, &argv, true); +#if IS_TRT_VERSION_GE(7200) + return RUN_ALL_TESTS(); +#endif + return 0; +} diff --git a/paddle/fluid/inference/tests/infer_ut/test_helper.h b/paddle/fluid/inference/tests/infer_ut/test_helper.h new file mode 100644 index 0000000000000..4732f95543bc4 --- /dev/null +++ b/paddle/fluid/inference/tests/infer_ut/test_helper.h @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
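+//
+// Usage sketch (illustrative) for the helpers defined in this header:
+//   std::vector<int> ids;
+//   paddle::test::Split<int>("1 2 3", ' ', &ids);             // ids == {1, 2, 3}
+//   std::vector<std::string> fields;
+//   paddle::test::Split<std::string>("a;b;c", ';', &fields);  // {"a", "b", "c"}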
+#pragma once
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace test {
+
+// split string to vector by sep
+static void split(const std::string &str, char sep,
+                  std::vector<std::string> *pieces, bool ignore_null = true) {
+  pieces->clear();
+  if (str.empty()) {
+    if (!ignore_null) {
+      pieces->push_back(str);
+    }
+    return;
+  }
+  size_t pos = 0;
+  size_t next = str.find(sep, pos);
+  while (next != std::string::npos) {
+    pieces->push_back(str.substr(pos, next - pos));
+    pos = next + 1;
+    next = str.find(sep, pos);
+  }
+  if (!str.substr(pos).empty()) {
+    pieces->push_back(str.substr(pos));
+  }
+}
+
+template <typename T>
+void GetValueFromStream(std::stringstream *ss, T *t) {
+  (*ss) >> (*t);
+}
+
+template <>
+void GetValueFromStream<std::string>(std::stringstream *ss, std::string *t) {
+  *t = ss->str();
+}
+
+// Split string to multiple vector
+template <typename T>
+void Split(const std::string &line, char sep, std::vector<T> *v) {
+  std::stringstream ss;
+  T t;
+  for (auto c : line) {
+    if (c != sep) {
+      ss << c;
+    } else {
+      GetValueFromStream(&ss, &t);
+      v->push_back(std::move(t));
+      ss.str({});
+      ss.clear();
+    }
+  }
+
+  if (!ss.str().empty()) {
+    GetValueFromStream(&ss, &t);
+    v->push_back(std::move(t));
+    ss.str({});
+    ss.clear();
+  }
+}
+
+}  // namespace test
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/infer_ut/test_mobilnetv1.cc b/paddle/fluid/inference/tests/infer_ut/test_mobilnetv1.cc
new file mode 100644
index 0000000000000..21991d0da06a1
--- /dev/null
+++ b/paddle/fluid/inference/tests/infer_ut/test_mobilnetv1.cc
@@ -0,0 +1,86 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
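+//
+// Outline of the tuned-dynamic-shape flow exercised below (illustrative
+// summary):
+//   1. Run a plain GPU predictor with CollectShapeRangeInfo(shape_range_info)
+//      so the tensor shape ranges observed at runtime are dumped to a pbtxt
+//      file (the dump happens when that predictor is destroyed).
+//   2. Build a second config with EnableTensorRtEngine(...) plus
+//      EnableTunedTensorRtDynamicShape(shape_range_info, true) so TensorRT
+//      reuses the collected ranges instead of hand-written dynamic shapes.
+//   3. Compare the TensorRT outputs against the plain-GPU ground truth via
+//      CompareRecord().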
+ +#include "test_helper.h" // NOLINT +#include "test_suite.h" // NOLINT + +DEFINE_string(modeldir, "", "Directory of the inference model."); + +namespace paddle_infer { + +paddle::test::Record PrepareInput(int batch_size, int shape_size = 224) { + // init input data + int channel = 3; + int width = shape_size; // w = 224 + int height = shape_size; // h = 224 + paddle::test::Record image_Record; + int input_num = batch_size * channel * width * height; + std::vector input_data(input_num, 1); + image_Record.data = input_data; + image_Record.shape = std::vector{batch_size, channel, width, height}; + image_Record.type = paddle::PaddleDType::FLOAT32; + return image_Record; +} + +TEST(tensorrt_tester_mobilenetv1, tuned_dynamic_trt_fp32_bz2) { + bool tuned_shape = true; + std::string shape_range_info = FLAGS_modeldir + "/shape_range_info.pbtxt"; + LOG(INFO) << "tensorrt tuned info saved to " << shape_range_info; + + // init input data + std::map my_input_data_map; + my_input_data_map["x"] = PrepareInput(2, 448); + // init output data + std::map infer_output_data, + truth_output_data; + if (tuned_shape) { + // NOTE: shape_range_info will be saved after destructor of predictor + // function + // prepare groudtruth config + paddle_infer::Config tune_config; + tune_config.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + tune_config.SwitchIrOptim(false); + tune_config.EnableUseGpu(1000, 0); + tune_config.CollectShapeRangeInfo(shape_range_info); + + auto predictor_tune = paddle_infer::CreatePredictor(tune_config); + SingleThreadPrediction(predictor_tune.get(), &my_input_data_map, + &truth_output_data, 1); + } + + // prepare inference config + paddle_infer::Config config; + config.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config.EnableUseGpu(1000, 0); + config.EnableTensorRtEngine( + 1 << 20, 2, 5, paddle_infer::PrecisionType::kFloat32, false, false); + config.EnableTunedTensorRtDynamicShape(shape_range_info, true); + LOG(INFO) << config.Summary(); + paddle_infer::services::PredictorPool pred_pool(config, 1); + SingleThreadPrediction(pred_pool.Retrive(0), &my_input_data_map, + &infer_output_data); + // check outputs + CompareRecord(&truth_output_data, &infer_output_data); + VLOG(1) << "finish test"; +} + +} // namespace paddle_infer + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + ::google::ParseCommandLineFlags(&argc, &argv, true); + return RUN_ALL_TESTS(); +} diff --git a/paddle/fluid/inference/tests/infer_ut/test_suite.h b/paddle/fluid/inference/tests/infer_ut/test_suite.h index 0b580cd7c7e86..a5c8c52402180 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_suite.h +++ b/paddle/fluid/inference/tests/infer_ut/test_suite.h @@ -14,6 +14,7 @@ #pragma once #include #include +#include #include #include #include @@ -31,6 +32,18 @@ namespace paddle { namespace test { +#define IS_TRT_VERSION_GE(version) \ + ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \ + NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) >= version) + +#define IS_TRT_VERSION_LT(version) \ + ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \ + NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) < version) + +#define TRT_VERSION \ + NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \ + NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD + class Record { public: std::vector data; @@ -96,7 +109,7 @@ void SingleThreadPrediction(paddle_infer::Predictor *predictor, switch (output_tensor->type()) { case 
paddle::PaddleDType::INT64: { - std::cout << "int64" << std::endl; + VLOG(1) << "output_tensor dtype: int64"; std::vector out_data; output_Record.type = paddle::PaddleDType::INT64; out_data.resize(out_num); @@ -108,7 +121,7 @@ void SingleThreadPrediction(paddle_infer::Predictor *predictor, break; } case paddle::PaddleDType::FLOAT32: { - std::cout << "float32" << std::endl; + VLOG(1) << "output_tensor dtype: float32"; std::vector out_data; output_Record.type = paddle::PaddleDType::FLOAT32; out_data.resize(out_num); @@ -119,7 +132,7 @@ void SingleThreadPrediction(paddle_infer::Predictor *predictor, break; } case paddle::PaddleDType::INT32: { - std::cout << "int32" << std::endl; + VLOG(1) << "output_tensor dtype: int32"; std::vector out_data; output_Record.type = paddle::PaddleDType::INT32; out_data.resize(out_num); @@ -139,10 +152,12 @@ void CompareRecord(std::map *truth_output_data, float epislon = 1e-5) { for (const auto & [ key, value ] : *infer_output_data) { auto truth_record = (*truth_output_data)[key]; - LOG(INFO) << "output name: " << key; + VLOG(1) << "output name: " << key; size_t numel = value.data.size() / sizeof(float); EXPECT_EQ(value.data.size(), truth_record.data.size()); for (size_t i = 0; i < numel; ++i) { + VLOG(1) << "compare: " << value.data.data()[i] << ",\t" + << truth_record.data.data()[i]; ASSERT_LT(fabs(value.data.data()[i] - truth_record.data.data()[i]), epislon); } diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 9a0637453f03f..6b4afae9f8c75 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -99,7 +99,7 @@ cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade) -cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator) +cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator flags) cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator) cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS auto_growth_best_fit_allocator) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index bfc4a1d598200..78bce53b6f4ff 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -37,14 +37,15 @@ #endif #include "paddle/fluid/platform/npu_info.h" -DEFINE_int64( +PADDLE_DEFINE_EXPORTED_int64( gpu_allocator_retry_time, 10000, "The retry time (milliseconds) when allocator fails " "to allocate memory. No retry if this value is not greater than 0"); -DEFINE_bool(use_system_allocator, false, - "Whether to use system allocator to allocate CPU and GPU memory. " - "Only used for unittests."); +PADDLE_DEFINE_EXPORTED_bool( + use_system_allocator, false, + "Whether to use system allocator to allocate CPU and GPU memory. 
" + "Only used for unittests."); namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index cca29797bb68c..a35d8a73f7eda 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -17,18 +17,21 @@ #include #include // NOLINT #include "paddle/fluid/memory/allocation/aligned_allocator.h" - -DEFINE_bool(free_idle_chunk, false, - "Whether to free idle chunk when each allocation is freed. " - "If false, all freed allocation would be cached to speed up next " - "allocation request. If true, no allocation would be cached. This " - "flag only works when FLAGS_allocator_strategy=auto_growth."); - -DEFINE_bool(free_when_no_cache_hit, false, - "Whether to free idle chunks when no cache hit. If true, idle " - "chunk would be freed when no cache hit; if false, idle " - "chunk would be freed when out of memory occurs. This flag " - "only works when FLAGS_allocator_strategy=auto_growth."); +#include "paddle/fluid/platform/flags.h" + +PADDLE_DEFINE_EXPORTED_READONLY_bool( + free_idle_chunk, false, + "Whether to free idle chunk when each allocation is freed. " + "If false, all freed allocation would be cached to speed up next " + "allocation request. If true, no allocation would be cached. This " + "flag only works when FLAGS_allocator_strategy=auto_growth."); + +PADDLE_DEFINE_EXPORTED_READONLY_bool( + free_when_no_cache_hit, false, + "Whether to free idle chunks when no cache hit. If true, idle " + "chunk would be freed when no cache hit; if false, idle " + "chunk would be freed when out of memory occurs. This flag " + "only works when FLAGS_allocator_strategy=auto_growth."); namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/mmap_allocator.cc b/paddle/fluid/memory/allocation/mmap_allocator.cc index 3fff18b9bc39d..acaf5d548555c 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.cc +++ b/paddle/fluid/memory/allocation/mmap_allocator.cc @@ -59,7 +59,7 @@ std::string GetIPCName() { #endif handle += "_"; handle += std::to_string(rd()); - return std::move(handle); + return handle; } std::shared_ptr AllocateMemoryMapWriterAllocation( diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 9cd35ad8ad9da..2c00b34dd1353 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -34,12 +34,13 @@ #include "paddle/fluid/platform/xpu/xpu_header.h" #endif -DEFINE_bool(init_allocated_mem, false, - "It is a mistake that the values of the memory allocated by " - "BuddyAllocator are always zeroed in some op's implementation. " - "To find this error in time, we use init_allocated_mem to indicate " - "that initializing the allocated memory with a small value " - "during unit testing."); +PADDLE_DEFINE_EXPORTED_bool( + init_allocated_mem, false, + "It is a mistake that the values of the memory allocated by " + "BuddyAllocator are always zeroed in some op's implementation. 
" + "To find this error in time, we use init_allocated_mem to indicate " + "that initializing the allocated memory with a small value " + "during unit testing."); DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_uint64(initial_gpu_memory_in_mb); DECLARE_uint64(reallocate_gpu_memory_in_mb); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 4a82f558ff4ec..0d7d0a5e13bf3 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -59,6 +59,10 @@ if (WITH_GPU) endif() endif() +if (WITH_POCKETFFT) + SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} pocketfft) +endif() + SET(OP_MKL_DEPS "") if (NOT WITH_MKL OR NOT WITH_AVX) @@ -75,7 +79,7 @@ if(WITH_UNITY_BUILD) endif() register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op - sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) + sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS}) @@ -94,6 +98,12 @@ else() op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) endif() +if (WITH_GPU AND (NOT WITH_ROCM)) + op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda ${OP_HEADER_DEPS}) +else() + op_library(spectral_op SRCS spectral_op.cc DEPS ${OP_HEADER_DEPS}) +endif() + op_library(lstm_op DEPS ${OP_HEADER_DEPS} lstm_compute) op_library(eye_op DEPS ${OP_HEADER_DEPS}) op_library(recurrent_op DEPS ${OP_HEADER_DEPS}) @@ -108,12 +118,12 @@ endif() cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEPS operator) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lapack_function lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling segment_pooling executor device_memory_aligment generator) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse matrix_solve) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost ps_gpu_wrapper) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_function) diff --git a/paddle/fluid/operators/arg_max_op_xpu.cc b/paddle/fluid/operators/arg_max_op_xpu.cc new file mode 100644 index 0000000000000..8060b5cf755c0 --- /dev/null +++ b/paddle/fluid/operators/arg_max_op_xpu.cc @@ -0,0 +1,70 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the Licnse. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/arg_min_max_op_base.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +template +class ArgMaxXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto dtype = ctx.Attr("dtype"); + PADDLE_ENFORCE_EQ( + (dtype < 0 || dtype == 3), true, + platform::errors::InvalidArgument( + "The attribute of dtype in xpu argmin/argmax must be [%s], but " + "received [%s]", + paddle::framework::DataTypeToString( + framework::proto::VarType::INT64), + paddle::framework::DataTypeToString( + static_cast(dtype)))); + + out->template mutable_data(ctx.GetPlace()); + auto axis = ctx.Attr("axis"); + const bool& flatten = ctx.Attr("flatten"); + framework::DDim x_dims; + if (flatten) { + x_dims = framework::make_ddim({x->numel()}); + // if flatten, the axis just as 0 + axis = 0; + } else { + x_dims = x->dims(); + if (axis < 0) axis += x_dims.size(); + } + auto xdims_vec = framework::vectorize(x_dims); + auto& dev_ctx = ctx.template device_context(); + int r = xpu::argmax(dev_ctx.x_context(), x->data(), out->data(), + xdims_vec, axis); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU argmax kernel return wrong value[%d %s].", r, + XPUAPIErrorMsg[r])); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + arg_max, ops::ArgMaxXPUKernel); + +#endif diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 4783aa3a86fb3..a400d27b798e3 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/concat_op.h" +#include #include #include #include @@ -237,7 +238,11 @@ REGISTER_OP_CPU_KERNEL( ops::ConcatKernel, ops::ConcatKernel, - ops::ConcatKernel); + ops::ConcatKernel, + ops::ConcatKernel>, + ops::ConcatKernel>); REGISTER_OP_CPU_KERNEL( concat_grad, ops::ConcatGradKernel, @@ -247,4 +252,8 @@ REGISTER_OP_CPU_KERNEL( ops::ConcatGradKernel, ops::ConcatGradKernel, - ops::ConcatGradKernel); + ops::ConcatGradKernel, + ops::ConcatGradKernel>, + ops::ConcatGradKernel>); diff --git a/paddle/fluid/operators/concat_op.cu.cc b/paddle/fluid/operators/concat_op.cu.cc index 63025c3bd030f..2be7632985754 100644 --- a/paddle/fluid/operators/concat_op.cu.cc +++ b/paddle/fluid/operators/concat_op.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/concat_op.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -24,7 +25,11 @@ REGISTER_OP_CUDA_KERNEL( ops::ConcatKernel, ops::ConcatKernel, ops::ConcatKernel, - ops::ConcatKernel); + ops::ConcatKernel, + ops::ConcatKernel>, + ops::ConcatKernel>); REGISTER_OP_CUDA_KERNEL( concat_grad, ops::ConcatGradKernel, @@ -33,4 +38,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ConcatGradKernel, ops::ConcatGradKernel, ops::ConcatGradKernel, - ops::ConcatGradKernel); + ops::ConcatGradKernel, + ops::ConcatGradKernel>, + ops::ConcatGradKernel>); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 4cd20e6ecd5e6..1610705c4694c 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -50,6 +50,15 @@ std::vector ConvOp::ComputeOutputShape( ctx->Attrs().Get("padding_algorithm"); int groups = ctx->Attrs().Get("groups"); std::vector dilations = ctx->Attrs().Get>("dilations"); + int dilation_size = dilations.size(); + for (int i = 0; i < dilation_size; ++i) { + PADDLE_ENFORCE_GT( + dilations[i], 0, + platform::errors::InvalidArgument( + "The dilation of Op(Conv) should be larget than 0, but received " + "dilation is %d.", + dilations[i])); + } const std::string data_format = ctx->Attrs().Get("data_format"); // MKL-DNN Kernels are using NCHW order of dims description diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index efbd653ffd3b0..c04d04f841388 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -17,8 +17,6 @@ endfunction() detection_library(bipartite_match_op SRCS bipartite_match_op.cc) detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu) -detection_library(iou_similarity_op SRCS iou_similarity_op.cc -iou_similarity_op.cu) detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) @@ -58,6 +56,12 @@ else() detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc) endif() +if(WITH_XPU) + detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_xpu.cc) +else() + detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op.cu) +endif() + detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu) #Export local libraries to parent # set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/detection/iou_similarity_op_xpu.cc b/paddle/fluid/operators/detection/iou_similarity_op_xpu.cc new file mode 100644 index 0000000000000..59238b92c5085 --- /dev/null +++ b/paddle/fluid/operators/detection/iou_similarity_op_xpu.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/detection/iou_similarity_op.h" + +namespace paddle { +namespace operators { + +template +class XPUIOUSimilarityKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::LoDTensor* in_x = ctx.Input("X"); + const framework::Tensor* in_y = ctx.Input("Y"); + bool normalized = ctx.Attr("box_normalized"); + framework::LoDTensor* out = ctx.Output("Out"); + + int x_n = in_x->dims()[0]; + int y_n = in_y->dims()[0]; + T eps = static_cast(1e-10); + + auto& dev_ctx = ctx.template device_context(); + int r = xpu::iou_similarity( + dev_ctx.x_context(), in_x->data(), in_y->data(), + out->mutable_data(ctx.GetPlace()), x_n, y_n, eps, normalized); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU iou_similarity kernel return wrong value[%d %s].", r, + XPUAPIErrorMsg[r])); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using XPU = paddle::platform::XPUDeviceContext; + +REGISTER_OP_XPU_KERNEL(iou_similarity, ops::XPUIOUSimilarityKernel); + +#endif diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc new file mode 100644 index 0000000000000..98247fbc862bb --- /dev/null +++ b/paddle/fluid/operators/determinant_op.cc @@ -0,0 +1,194 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/determinant_op.h" + +namespace paddle { +namespace operators { + +class DeterminantOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "determinant"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "determinant"); + } +}; + +class DeterminantOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", "(Tensor) The input tensor of determinant."); + AddOutput("Out", + "(Tensor) The output Tensor containing the determinant" + "value of a square matrix or batches of square matrices "); + + AddComment(R"DOC( +Determinant Operator.)DOC"); + } +}; + +class DeterminantGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", + "DeterminantGradOp"); + OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "DeterminantGradOp"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + framework::GradVarName("Out"), "DeterminantGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")), "Output", + framework::GradVarName("Input"), "DeterminantGradOp"); + + ctx->SetOutputDim(framework::GradVarName("Input"), + ctx->GetInputDim("Input")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } +}; + +template +class DeterminantGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("determinant_grad"); + grad_op->SetInput("Input", this->Input("Input")); + grad_op->SetInput("Out", this->Output("Out")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("Input"), + this->InputGrad("Input")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERER(DeterminantGradNoNeedBufferVarsInferer, + "Input"); + +class SlogDeterminantOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "determinant"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "determinant"); + } +}; + +class SlogDeterminantOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", "(Tensor) The input tensor of SlogDeterminant."); + AddOutput("Out", + "(Tensor) The output tensor containing the sign of the" + "determinant and the natural logarithm" + "of the absolute value of determinant,"); + + AddComment(R"DOC( +SlogDeterminant Operator.)DOC"); + } +}; + +class SlogDeterminantGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", + "SlogDeterminantGradOp"); + 
OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", + "SlogDeterminantGradOp"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + framework::GradVarName("Out"), "SlogDeterminantGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")), "Output", + framework::GradVarName("Input"), "SlogDeterminantGradOp"); + + ctx->SetOutputDim(framework::GradVarName("Input"), + ctx->GetInputDim("Input")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } +}; + +template +class SlogDeterminantGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("slogdeterminant_grad"); + grad_op->SetInput("Input", this->Input("Input")); + grad_op->SetInput("Out", this->Output("Out")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("Input"), + this->InputGrad("Input")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERER(SlogDeterminantGradNoNeedBufferVarsInferer, + "Input"); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OPERATOR(determinant, ops::DeterminantOp, ops::DeterminantOpMaker, + ops::DeterminantGradOpMaker, + ops::DeterminantGradOpMaker); + +REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp) + +REGISTER_OP_CPU_KERNEL(determinant, + ops::DeterminantKernel, + ops::DeterminantKernel); + +REGISTER_OP_CPU_KERNEL( + determinant_grad, ops::DeterminantGradKernel, + ops::DeterminantGradKernel); + +REGISTER_OPERATOR(slogdeterminant, ops::SlogDeterminantOp, + ops::SlogDeterminantOpMaker, + ops::SlogDeterminantGradOpMaker, + ops::SlogDeterminantGradOpMaker); + +REGISTER_OPERATOR(slogdeterminant_grad, + ops::SlogDeterminantGradOp) // reuse det grad op + +REGISTER_OP_CPU_KERNEL( + slogdeterminant, ops::SlogDeterminantKernel, + ops::SlogDeterminantKernel); + +REGISTER_OP_CPU_KERNEL( + slogdeterminant_grad, + ops::SlogDeterminantGradKernel, + ops::SlogDeterminantGradKernel); diff --git a/paddle/fluid/operators/determinant_op.cu b/paddle/fluid/operators/determinant_op.cu new file mode 100644 index 0000000000000..d19d4c3d09386 --- /dev/null +++ b/paddle/fluid/operators/determinant_op.cu @@ -0,0 +1,36 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/determinant_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL( + determinant, ops::DeterminantKernel, + ops::DeterminantKernel); + +REGISTER_OP_CUDA_KERNEL( + determinant_grad, + ops::DeterminantGradKernel, + ops::DeterminantGradKernel); + +REGISTER_OP_CUDA_KERNEL( + slogdeterminant, ops::SlogDeterminantKernel, + ops::SlogDeterminantKernel); + +REGISTER_OP_CUDA_KERNEL( + slogdeterminant_grad, + ops::SlogDeterminantGradKernel, + ops::SlogDeterminantGradKernel); diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h new file mode 100644 index 0000000000000..4c17869fb5d2a --- /dev/null +++ b/paddle/fluid/operators/determinant_op.h @@ -0,0 +1,436 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/fluid/operators/math/matrix_inverse.h" +#include "paddle/fluid/operators/svd_helper.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +T sign(T val) { + return static_cast(T(0) < val) - (val < T(0)); +} + +template +class EigenMatrix {}; + +template <> +class EigenMatrix { + public: + using MatrixType = Eigen::MatrixXf; +}; + +template <> +class EigenMatrix { + public: + using MatrixType = Eigen::MatrixXd; +}; + +inline int64_t GetBatchCount(const framework::DDim dims) { + int64_t batch_count = 1; + auto dim_size = dims.size(); + PADDLE_ENFORCE_GE( + dim_size, 2, + platform::errors::InvalidArgument( + "the input matrix dimension size should greater than 2.")); + + // Cumulative multiplying each dimension until the last 2 to get the batch + // count, + // for example a tensor with shape [3,3,3,3], the batch count of matrices is + // 9. 
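+  // Illustration: for dims = [2, 5, 4, 4] the loop below gives
+  // batch_count = 2 * 5 = 10 independent 4 x 4 matrices.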
+  for (int64_t i = 0; i < dims.size() - 2; i++) {
+    batch_count *= dims[i];
+  }
+
+  return batch_count;
+}
+
+template <typename T>
+struct DeterminantFunctor {
+  void operator()(const Tensor& input, const framework::ExecutionContext ctx,
+                  int64_t rank, int64_t batch_count, Tensor* output) {
+    std::vector<T> input_vec;
+    std::vector<T> output_vec;
+    framework::TensorToVector(input, ctx.device_context(), &input_vec);
+    for (int64_t i = 0; i < batch_count; ++i) {  // could be parallelized
+      auto begin_iter = input_vec.begin() + i * rank * rank;
+      auto end_iter = input_vec.begin() + (i + 1) * rank * rank;
+      std::vector<T> sub_vec(begin_iter,
+                             end_iter);  // get every square matrix data
+      typename EigenMatrix<T>::MatrixType matrix(rank, rank);
+      for (int64_t i = 0; i < rank; ++i) {
+        for (int64_t j = 0; j < rank; ++j) {
+          matrix(i, j) = sub_vec[rank * i + j];
+        }
+      }
+      output_vec.push_back(matrix.determinant());
+    }
+    framework::TensorFromVector(output_vec, output);
+  }
+};
+template <typename DeviceContext, typename T>
+class DeterminantKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* input = context.Input<framework::Tensor>("Input");
+    auto input_dim = vectorize(input->dims());
+    auto input_dim_size = input_dim.size();
+    auto* output = context.Output<framework::Tensor>("Out");
+
+    auto batch_count = GetBatchCount(input->dims());
+    VLOG(2) << "input dim:" << input->dims();
+    PADDLE_ENFORCE_GE(
+        input_dim_size, 2,
+        platform::errors::InvalidArgument(
+            "the input matrix dimension size should be at least 2."));
+    PADDLE_ENFORCE_EQ(input_dim[input_dim_size - 1],
+                      input_dim[input_dim_size - 2],
+                      platform::errors::InvalidArgument(
+                          "the input matrix should be a square matrix."));
+    auto rank = input_dim[input_dim_size - 1];  // square matrix length
+    DeterminantFunctor<T>()(*input, context, rank, batch_count, output);
+    auto output_dims =
+        framework::slice_ddim(input->dims(), 0, input_dim_size - 2);
+    if (input_dim_size > 2) {
+      output->Resize(output_dims);
+    } else {
+      // when the input is a two-dimensional matrix, the det value is a number.
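+      // e.g. Input [3, 3] -> Out [1] here, while Input [6, 3, 3] -> Out [6]
+      // through the branch above.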
+ output->Resize({1}); + } + VLOG(2) << "output dim:" << output->dims(); + } +}; + +template +struct FoundZeroFunctor { + FoundZeroFunctor(const T* x, int64_t numel, bool* res) + : x_(x), numel_(numel), res_(res) {} + HOSTDEVICE void operator()(size_t idx) const { + if (*res_ || idx >= static_cast(numel_)) { + // founded zero number + return; + } + *res_ = (x_[idx] == static_cast(0)); + } + const T* x_; + int64_t numel_; + bool* res_; +}; + +template +inline bool CheckMatrixInvertible(const framework::ExecutionContext& ctx, + const framework::Tensor* det) { + auto& dev_ctx = ctx.template device_context(); + auto numel = det->numel(); + + framework::Tensor dev_tensor; + auto* data = dev_tensor.mutable_data({1}, ctx.GetPlace()); + + // set false + math::SetConstant zero; + zero(dev_ctx, &dev_tensor, false); + + // find whether zero + platform::ForRange for_range(dev_ctx, numel); + FoundZeroFunctor functor(det->data(), numel, data); + for_range(functor); + + // copy to host + dev_ctx.Wait(); + framework::Tensor cpu_tensor; + framework::TensorCopy(dev_tensor, platform::CPUPlace(), &cpu_tensor); + + // if founded zero, the matrix is not invertible + // else the matrix is invertible + auto* res = cpu_tensor.data(); + return !(*res); +} + +template +class DeterminantGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto& dev_ctx = context.template device_context(); + const auto* input = context.Input("Input"); + const auto* det = context.Input("Out"); + const auto* grad = + context.Input(framework::GradVarName("Out")); + auto* ddet = + context.Output(framework::GradVarName("Input")); + + auto input_dims_size = input->dims().size(); + if (input_dims_size > 2) { + PADDLE_ENFORCE_EQ( + grad->dims().size() + 2, input_dims_size, + platform::errors::InvalidArgument( + "The grad tensor of det dims size should 2 less than" + " input tensor's, but here differ %d", + input_dims_size - grad->dims().size())); + } else if (input_dims_size == 2) { + // input dims size 2 and grad dims size 1 is possible + PADDLE_ENFORCE_EQ( + grad->dims().size(), 1, + platform::errors::InvalidArgument( + "The grad tensor of det dims size should 2 less than" + " input tensor's, but here differ %d", + input_dims_size - grad->dims().size())); + } else { + // checked in forward, pass + } + + // Check Whether the matrix is invertible + // (matrix A not invertible) == (det(A)=0) + if (!CheckMatrixInvertible(context, det)) { + // The matrix is not invertible + VLOG(3) << "The input matrix not invertible!"; + ddet->Resize(input->dims()); + ddet->mutable_data(context.GetPlace()); + math::SetConstant zero; + zero(dev_ctx, ddet, static_cast(0.0f)); + return; + } + + // The matrix is invertible + // let |A| = Determinant(A) + // Ref to https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf + // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2, + // -1) + + math::DeviceIndependenceTensorOperations helper(context); + + // First: inverse(A) + framework::Tensor inverse_A; + // A must be square matrices! 
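+    // (Jacobi's formula: d det(A) / dA = det(A) * inverse(A)^T, which is why
+    // inverse(A) is computed here and transposed below.)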
+ inverse_A.Resize(input->dims()); + inverse_A.mutable_data(context.GetPlace()); + + math::MatrixInverseFunctor mat_inv; + mat_inv(dev_ctx, *input, &inverse_A); + + VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); + + // Second: inverse(A).transpose(-2, -1) + framework::Tensor transpose_inverse_A = helper.Transpose(inverse_A); + VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: " + << transpose_inverse_A.dims(); + + // Third: dA * |A| + auto mul_dA_detA = helper.Mul(*grad, *det); + VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims(); + + // Fourth: unsqueeze(dA * |A|, [-1, -2]) + auto unsqueeze1 = helper.Unsqueeze(mul_dA_detA, -1); + auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2); + VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims(); + + // Finally: unsqueeze(dA * |A|) * inverse(A) + auto res = helper.Mul(unsqueeze2, transpose_inverse_A); + + VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); + + framework::TensorCopy(res, context.GetPlace(), ddet); + + ddet->Resize(input->dims()); + VLOG(3) << "d|A| dims: " << ddet->dims(); + } +}; + +template +struct SlogDeterminantFunctor { + void operator()(const Tensor& input, const framework::ExecutionContext ctx, + int64_t rank, int64_t batch_count, Tensor* output) { + std::vector input_vec; + std::vector sign_vec; + std::vector log_vec; + std::vector output_vec; + framework::TensorToVector(input, ctx.device_context(), &input_vec); + for (int64_t i = 0; i < batch_count; ++i) { // maybe can be parallel + auto begin_iter = input_vec.begin() + i * rank * rank; + auto end_iter = input_vec.begin() + (i + 1) * rank * rank; + std::vector sub_vec(begin_iter, + end_iter); // get every square matrix data + typename EigenMatrix::MatrixType matrix(rank, rank); + for (int64_t i = 0; i < rank; ++i) { + for (int64_t j = 0; j < rank; ++j) { + matrix(i, j) = sub_vec[rank * i + j]; + } + } + VLOG(2) << "det value: " << matrix.determinant(); + VLOG(2) << "matrix val: " << matrix; + auto det_val = matrix.determinant(); + sign_vec.push_back(sign(det_val)); + det_val >= 0 + ? log_vec.push_back(std::log(det_val)) + : log_vec.push_back(std::log(std::abs( + det_val))); // for computing log value of a negative value. + } + // merge sign_vec and log_vec as final output_vec + output_vec.insert(output_vec.end(), sign_vec.begin(), sign_vec.end()); + output_vec.insert(output_vec.end(), log_vec.begin(), log_vec.end()); + framework::TensorFromVector(output_vec, output); + } +}; + +template +class SlogDeterminantKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("Input"); + auto input_dim = vectorize(input->dims()); + auto input_dim_size = input_dim.size(); + auto* output = context.Output("Out"); + + auto batch_count = GetBatchCount(input->dims()); + VLOG(2) << "input dim:" << input->dims(); + PADDLE_ENFORCE_GE( + input_dim_size, 2, + platform::errors::InvalidArgument( + "the input matrix dimension size should greater than 2.")); + PADDLE_ENFORCE_EQ(input_dim[input_dim_size - 1], + input_dim[input_dim_size - 2], + platform::errors::InvalidArgument( + "the input matrix should be square matrix.")); + auto rank = input_dim[input_dim_size - 1]; // square matrix length + SlogDeterminantFunctor()(*input, context, rank, batch_count, output); + std::vector output_dim_vec(input_dim.begin(), input_dim.end() - 2); + if (input_dim.size() == static_cast(2)) { + // when input is a two-dimension matrix, The det value is a number. 
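+      // e.g. Input [3, 3] -> Out [2, 1], while Input [5, 3, 3] -> Out [2, 5];
+      // row 0 holds the signs, row 1 the log-abs-determinants.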
+ output_dim_vec = {1}; + } + output_dim_vec.insert(output_dim_vec.begin(), + 2); // make the output dims as same as numpy + auto output_dims = framework::make_ddim(output_dim_vec); + output->Resize(output_dims); + VLOG(2) << "output dim:" << output->dims(); + } +}; + +template +class SlogDeterminantGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto& dev_ctx = context.template device_context(); + const auto* input = context.Input("Input"); + const auto* slogdet = context.Input("Out"); + const auto* grad = + context.Input(framework::GradVarName("Out")); + auto* dslogdet = + context.Output(framework::GradVarName("Input")); + + PADDLE_ENFORCE_EQ(grad->dims()[0], 2, + platform::errors::InvalidArgument( + "The grad tensor of SlogDet should contain two" + " grad: sign and absslogdet, but here %ld.", + grad->dims()[0])); + if (input->dims().size() > 2) { + PADDLE_ENFORCE_EQ( + grad->dims().size() + 1, input->dims().size(), + platform::errors::InvalidArgument( + "The grad tensor of slogdet dims size should 1 less than" + " input tensor's, but here differ %d", + input->dims().size() - grad->dims().size())); + } + + // Check Whether the matrix is invertible + // (matrix A not invertible) == (absslogdet(A)=0) + auto slogdet_vec = slogdet->Split(1, 0); + auto absslogdet_val = slogdet_vec[0]; + if (!CheckMatrixInvertible(context, &absslogdet_val)) { + // The matrix is not invertible + VLOG(3) << "The input matrix not invertible!"; + dslogdet->Resize(input->dims()); + dslogdet->mutable_data(context.GetPlace()); + math::SetConstant zero; + zero(dev_ctx, dslogdet, std::numeric_limits::quiet_NaN()); + return; + } + + // The matrix is invertible + // let sl|A| = SlogDeterminant(A) + // Ref to https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf + // we set dsl|A| = unsqueeze(dslA, [-1, -2]) * + // inverse(A).conj().transpose(-2, -1) + + math::DeviceIndependenceTensorOperations helper(context); + + // First: inverse(A) + framework::Tensor inverse_A; + // A must be square matrices! 
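+    // For real A, d log|det(A)| / dA = inverse(A)^T (the conj() below is a
+    // no-op for real element types), and the sign output is piecewise
+    // constant, which is why only the log-abs-det part of the incoming
+    // gradient is used below.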
+ inverse_A.Resize(input->dims()); + inverse_A.mutable_data(context.GetPlace()); + + math::MatrixInverseFunctor mat_inv; + mat_inv(dev_ctx, *input, &inverse_A); + + VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); + + // Second: inverse(A).conj() + framework::Tensor conj_inverse_A; + conj_inverse_A.Resize(inverse_A.dims()); + auto numel = input->numel(); + auto* conj_data = conj_inverse_A.mutable_data(context.GetPlace(), + size_t(numel * sizeof(T))); + + platform::ForRange for_range(dev_ctx, numel); + math::ConjFunctor functor(inverse_A.data(), numel, conj_data); + for_range(functor); + + VLOG(3) << "inverse(A).conj() dims: " << conj_inverse_A.dims(); + + // Third: inverse(A).conj().transpose(-2, -1) + framework::Tensor transpose_inverse_A = helper.Transpose(conj_inverse_A); + VLOG(3) << "inverse(A).conj().transpose(-2, -1) dims: " + << transpose_inverse_A.dims(); + + // Fourth: split grad value to [sign_grad, absslogdet_grad] + auto grad_vec = grad->Split(1, 0); + auto det_grad = grad_vec[1]; + + // remmove useless first dimension + int det_grad_size = det_grad.dims().size(); + std::vector det_grad_vec; + for (int i = 1; i < det_grad_size; ++i) { + det_grad_vec.emplace_back(det_grad.dims()[i]); + } + det_grad.Resize(det_grad.dims().reshape(det_grad_vec)); + + // Fifth: unsqueeze(dslA, [-1, -2]) + auto unsqueeze1 = helper.Unsqueeze(det_grad, -1); + auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2); + VLOG(3) << "unsqueezed(dslA, [-1, -2]) dims: " << unsqueeze2.dims(); + + // Finally: unsqueeze(dslA) * inverse(A) + auto res = helper.Mul(unsqueeze2, transpose_inverse_A); + VLOG(3) << "unsqueeze(dslA) * inverse(A) dims: " << res.dims(); + + framework::TensorCopy(res, context.GetPlace(), dslogdet); + dslogdet->Resize(input->dims()); + VLOG(3) << "dsl|A| dims: " << dslogdet->dims(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/scale.cc b/paddle/fluid/operators/eigen/scale.cc index e85878f20aa2b..d9fbb878e35ea 100644 --- a/paddle/fluid/operators/eigen/scale.cc +++ b/paddle/fluid/operators/eigen/scale.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -42,6 +43,8 @@ template struct EigenScale; template struct EigenScale; template struct EigenScale; template struct EigenScale; +template struct EigenScale>; +template struct EigenScale>; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/eigen/scale.cu b/paddle/fluid/operators/eigen/scale.cu index 6a77f72f6200c..5e485799af52c 100644 --- a/paddle/fluid/operators/eigen/scale.cu +++ b/paddle/fluid/operators/eigen/scale.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -41,6 +42,8 @@ template struct EigenScale; template struct EigenScale; template struct EigenScale; template struct EigenScale; +template struct EigenScale>; +template struct EigenScale>; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/eigh_op.cc b/paddle/fluid/operators/eigh_op.cc index b3056bd43ba53..6835951a2381f 100644 --- a/paddle/fluid/operators/eigh_op.cc +++ b/paddle/fluid/operators/eigh_op.cc @@ -47,12 +47,9 @@ class EighOp : public framework::OperatorWithKernel { input_dim[rank - 2], input_dim[rank - 1])); std::vector values_dim; - if (rank > 2) { - for (auto i = 0; i < rank - 1; i++) { - values_dim.emplace_back(input_dim[i]); - } - } else { - values_dim = {input_dim[1]}; + + for (auto i = 0; i < rank - 1; i++) { + values_dim.emplace_back(input_dim[i]); } ctx->SetOutputDim("Eigenvalues", framework::make_ddim(values_dim)); @@ -99,9 +96,9 @@ class EighGradOp : public framework::OperatorWithKernel { "EighGrad"); OP_INOUT_CHECK(ctx->HasInput("Eigenvectors"), "Input", "Eigenvectors", "EighGrad"); - OP_INOUT_CHECK(ctx->HasInputs(framework::GradVarName("Eigenvalues")), + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Eigenvalues")), "Input", "Eigenvalues@GRAD", "EighGrad"); - OP_INOUT_CHECK(ctx->HasInputs(framework::GradVarName("Eigenvectors")), + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Eigenvectors")), "Input", "Eigenvectors@GRAD", "EighGrad"); auto dims = ctx->GetInputDim("Eigenvectors"); auto x_grad_name = framework::GradVarName("X"); @@ -150,18 +147,17 @@ REGISTER_OPERATOR(eigh, ops::EighOp, ops::EignOpMaker, REGISTER_OPERATOR(eigh_grad, ops::EighGradOp); REGISTER_OP_CPU_KERNEL( - eigh, ops::EighKernel, - ops::EighKernel, - ops::EighKernel, + ops::EighKernel, + ops::EighKernel>, - ops::EighKernel>); REGISTER_OP_CPU_KERNEL( - eigh_grad, - ops::EighGradKernel, - ops::EighGradKernel, - ops::EighGradKernel, + ops::EighGradKernel, + ops::EighGradKernel>, - ops::EighGradKernel>); diff --git a/paddle/fluid/operators/eigh_op.cu b/paddle/fluid/operators/eigh_op.cu index cfc9eba450959..827c551637d4d 100644 --- a/paddle/fluid/operators/eigh_op.cu +++ b/paddle/fluid/operators/eigh_op.cu @@ -14,40 +14,19 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/eigh_op.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class EighGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto input_var = ctx.Input("X"); - auto output_w_var = ctx.Output("Eigenvalues"); - auto output_v_var = ctx.Output("Eigenvectors"); - std::string lower = ctx.Attr("UPLO"); - bool is_lower = (lower == "L"); - math::MatrixEighFunctor functor; - functor(ctx, *input_var, output_w_var, output_v_var, is_lower, true); - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; - REGISTER_OP_CUDA_KERNEL( - eigh, ops::EighGPUKernel, ops::EighGPUKernel, - ops::EighGPUKernel>, - ops::EighGPUKernel>); + eigh, ops::EighKernel, + ops::EighKernel, + ops::EighKernel>, + ops::EighKernel>); REGISTER_OP_CUDA_KERNEL( - eigh_grad, - ops::EighGradKernel, - ops::EighGradKernel, - ops::EighGradKernel, + ops::EighGradKernel, + ops::EighGradKernel>, - ops::EighGradKernel>); diff --git a/paddle/fluid/operators/eigh_op.h b/paddle/fluid/operators/eigh_op.h index 0af38d44e5457..ad9b0f598311b 100644 --- a/paddle/fluid/operators/eigh_op.h +++ b/paddle/fluid/operators/eigh_op.h @@ -22,57 +22,51 @@ namespace operators { using Tensor = framework::Tensor; -template -using EigenTensor = framework::EigenTensor; -template -using EigenVector = framework::EigenVector; - -template +template class EighKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto input_var = ctx.Input("X"); - auto output_w_var = ctx.Output("Eigenvalues"); - auto output_v_var = ctx.Output("Eigenvectors"); + auto input = ctx.Input("X"); + auto output_w = ctx.Output("Eigenvalues"); + auto output_v = ctx.Output("Eigenvectors"); std::string lower = ctx.Attr("UPLO"); bool is_lower = (lower == "L"); - math::MatrixEighFunctorCPU functor; - functor(ctx, *input_var, output_w_var, output_v_var, is_lower, true); + math::MatrixEighFunctor functor; + functor(ctx, *input, output_w, output_v, is_lower, true); } }; -template +template class EighGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using ValueType = math::Real; auto& x_grad = *ctx.Output(framework::GradVarName("X")); x_grad.mutable_data(ctx.GetPlace()); - auto& output_w_var = *ctx.Input("Eigenvalues"); - auto& output_v_var = *ctx.Input("Eigenvectors"); + auto& output_w = *ctx.Input("Eigenvalues"); + auto& output_v = *ctx.Input("Eigenvectors"); auto& output_w_grad = *ctx.Input(framework::GradVarName("Eigenvalues")); auto& output_v_grad = *ctx.Input(framework::GradVarName("Eigenvectors")); - auto& dims = output_v_var.dims(); + auto& dims = output_v.dims(); const int m = dims[dims.size() - 1]; auto dito = math::DeviceIndependenceTensorOperations( ctx); - auto tV = dito.Transpose(dito.Conj(output_v_var)); - auto W = dito.Sub_(dito.Unsqueeze(output_w_var, -2), - dito.Unsqueeze(output_w_var, -1)); + auto tV = dito.Transpose(dito.Conj(output_v)); + auto W = dito.template Sub(dito.Unsqueeze(output_w, -2), + dito.Unsqueeze(output_w, -1)); Tensor result = dito.Matmul(tV, output_v_grad); result.mutable_data(dims, ctx.GetPlace()); std::vector out_shape = framework::vectorize(dims); auto constant = dito.Fill(out_shape, 0.5); result = dito.Sub(result, dito.Conj(dito.Transpose(result))); result = dito.Mul(result, constant); - result = dito.Div_(result, W); + result = dito.Div(result, 
W); result = dito.DiagFill(m, m, m, 0, output_w_grad, result); - x_grad = dito.Matmul(output_v_var, dito.Matmul(result, tV)); + x_grad = dito.Matmul(output_v, dito.Matmul(result, tV)); } }; diff --git a/paddle/fluid/operators/eigvals_op.cc b/paddle/fluid/operators/eigvals_op.cc new file mode 100644 index 0000000000000..dcf350190951e --- /dev/null +++ b/paddle/fluid/operators/eigvals_op.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/eigvals_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +class EigvalsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor), A complex- or real-valued tensor with shape (*, n, n)" + "where * is zero or more batch dimensions"); + AddOutput("Out", + "(Tensor) The output tensor with shape (*,n) cointaining the " + "eigenvalues of X."); + AddComment(R"DOC(eigvals operator + Return the eigenvalues of one or more square matrices. The eigenvalues are complex even when the input matrices are real. + )DOC"); + } +}; + +class EigvalsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Eigvals"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Eigvals"); + + DDim x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + platform::errors::InvalidArgument( + "The dimensions of Input(X) for Eigvals operator " + "should be at least 2, " + "but received X's dimension = %d, X's shape = [%s].", + x_dims.size(), x_dims)); + + if (ctx->IsRuntime() || !framework::contain_unknown_dim(x_dims)) { + int last_dim = x_dims.size() - 1; + PADDLE_ENFORCE_EQ(x_dims[last_dim], x_dims[last_dim - 1], + platform::errors::InvalidArgument( + "The last two dimensions of Input(X) for Eigvals " + "operator should be equal, " + "but received X's shape = [%s].", + x_dims)); + } + + auto output_dims = vectorize(x_dims); + output_dims.resize(x_dims.size() - 1); + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); + } +}; + +class EigvalsOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const { + auto input_dtype = ctx->GetInputDataType("X"); + auto output_dtype = framework::IsComplexType(input_dtype) + ? 
input_dtype + : framework::ToComplexType(input_dtype); + ctx->SetOutputDataType("Out", output_dtype); + } +}; +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(eigvals, ops::EigvalsOp, ops::EigvalsOpMaker, + ops::EigvalsOpVarTypeInference); +REGISTER_OP_CPU_KERNEL(eigvals, + ops::EigvalsKernel, + ops::EigvalsKernel, + ops::EigvalsKernel>, + ops::EigvalsKernel>); diff --git a/paddle/fluid/operators/eigvals_op.h b/paddle/fluid/operators/eigvals_op.h new file mode 100644 index 0000000000000..6fdf849ac7613 --- /dev/null +++ b/paddle/fluid/operators/eigvals_op.h @@ -0,0 +1,219 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/fluid/operators/math/lapack_function.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +template +struct PaddleComplex; + +template +struct PaddleComplex< + T, typename std::enable_if::value>::type> { + using type = paddle::platform::complex; +}; +template +struct PaddleComplex< + T, typename std::enable_if< + std::is_same>::value || + std::is_same>::value>::type> { + using type = T; +}; + +template +using PaddleCType = typename PaddleComplex::type; +template +using Real = typename math::Real; + +static void SpiltBatchSquareMatrix(const Tensor& input, + std::vector* output) { + DDim input_dims = input.dims(); + int last_dim = input_dims.size() - 1; + int n_dim = input_dims[last_dim]; + + DDim flattened_input_dims, flattened_output_dims; + if (input_dims.size() > 2) { + flattened_input_dims = flatten_to_3d(input_dims, last_dim - 1, last_dim); + } else { + flattened_input_dims = framework::make_ddim({1, n_dim, n_dim}); + } + + Tensor flattened_input; + flattened_input.ShareDataWith(input); + flattened_input.Resize(flattened_input_dims); + (*output) = flattened_input.Split(1, 0); +} + +static void CheckLapackEigResult(const int info, const std::string& name) { + PADDLE_ENFORCE_LE(info, 0, platform::errors::PreconditionNotMet( + "The QR algorithm failed to compute all the " + "eigenvalues in function %s.", + name.c_str())); + PADDLE_ENFORCE_GE( + info, 0, platform::errors::InvalidArgument( + "The %d-th argument has an illegal value in function %s.", + -info, name.c_str())); +} + +template +static typename std::enable_if::value>::type +LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input, + Tensor* output, Tensor* work, Tensor* rwork /*unused*/) { + Tensor a; // will be overwritten when lapackEig exit + framework::TensorCopy(input, input.place(), &a); + + Tensor w; + int64_t n_dim 
= input.dims()[1]; + auto* w_data = + w.mutable_data(framework::make_ddim({n_dim << 1}), ctx.GetPlace()); + + int64_t work_mem = work->memory_size(); + int64_t required_work_mem = 3 * n_dim * sizeof(T); + PADDLE_ENFORCE_GE( + work_mem, 3 * n_dim * sizeof(T), + platform::errors::InvalidArgument( + "The memory size of the work tensor in LapackEigvals function " + "should be at least %" PRId64 " bytes, " + "but received work\'s memory size = %" PRId64 " bytes.", + required_work_mem, work_mem)); + + int info = 0; + math::lapackEig('N', 'N', static_cast(n_dim), a.template data(), + static_cast(n_dim), w_data, NULL, 1, NULL, 1, + work->template data(), + static_cast(work_mem / sizeof(T)), + static_cast(NULL), &info); + + std::string name = "framework::platform::dynload::dgeev_"; + if (input.type() == framework::proto::VarType::FP64) { + name = "framework::platform::dynload::sgeev_"; + } + CheckLapackEigResult(info, name); + + platform::ForRange for_range( + ctx.template device_context(), n_dim); + math::RealImagToComplexFunctor> functor( + w_data, w_data + n_dim, output->template data>(), n_dim); + for_range(functor); +} + +template +typename std::enable_if>::value || + std::is_same>::value>::type +LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input, + Tensor* output, Tensor* work, Tensor* rwork) { + Tensor a; // will be overwritten when lapackEig exit + framework::TensorCopy(input, input.place(), &a); + + int64_t work_mem = work->memory_size(); + int64_t n_dim = input.dims()[1]; + int64_t required_work_mem = 3 * n_dim * sizeof(T); + PADDLE_ENFORCE_GE( + work_mem, 3 * n_dim * sizeof(T), + platform::errors::InvalidArgument( + "The memory size of the work tensor in LapackEigvals function " + "should be at least %" PRId64 " bytes, " + "but received work\'s memory size = %" PRId64 " bytes.", + required_work_mem, work_mem)); + + int64_t rwork_mem = rwork->memory_size(); + int64_t required_rwork_mem = (n_dim << 1) * sizeof(Real); + PADDLE_ENFORCE_GE( + rwork_mem, required_rwork_mem, + platform::errors::InvalidArgument( + "The memory size of the rwork tensor in LapackEigvals function " + "should be at least %" PRId64 " bytes, " + "but received rwork\'s memory size = %" PRId64 " bytes.", + required_rwork_mem, rwork_mem)); + + int info = 0; + math::lapackEig>( + 'N', 'N', static_cast(n_dim), a.template data(), + static_cast(n_dim), output->template data(), NULL, 1, NULL, 1, + work->template data(), static_cast(work_mem / sizeof(T)), + rwork->template data>(), &info); + + std::string name = "framework::platform::dynload::cgeev_"; + if (input.type() == framework::proto::VarType::COMPLEX64) { + name = "framework::platform::dynload::zgeev_"; + } + CheckLapackEigResult(info, name); +} + +template +class EigvalsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* input = ctx.Input("X"); + Tensor* output = ctx.Output("Out"); + output->mutable_data>(ctx.GetPlace()); + + std::vector input_matrices; + SpiltBatchSquareMatrix(*input, /*->*/ &input_matrices); + + int64_t n_dim = input_matrices[0].dims()[1]; + int64_t n_batch = input_matrices.size(); + DDim output_dims = output->dims(); + output->Resize(framework::make_ddim({n_batch, n_dim})); + std::vector output_vectors = output->Split(1, 0); + + // query workspace size + T qwork; + int info; + math::lapackEig>('N', 'N', static_cast(n_dim), + input_matrices[0].template data(), + static_cast(n_dim), NULL, NULL, 1, NULL, 1, + &qwork, -1, static_cast*>(NULL), 
&info); + int64_t lwork = static_cast(qwork); + + Tensor work, rwork; + try { + work.mutable_data(framework::make_ddim({lwork}), ctx.GetPlace()); + } catch (memory::allocation::BadAlloc&) { + LOG(WARNING) << "Failed to allocate Lapack workspace with the optimal " + << "memory size = " << lwork * sizeof(T) << " bytes, " + << "try reallocating a smaller workspace with the minimum " + << "required size = " << 3 * n_dim * sizeof(T) << " bytes, " + << "this may lead to bad performance."; + lwork = 3 * n_dim; + work.mutable_data(framework::make_ddim({lwork}), ctx.GetPlace()); + } + if (framework::IsComplexType(input->type())) { + rwork.mutable_data>(framework::make_ddim({n_dim << 1}), + ctx.GetPlace()); + } + + for (int64_t i = 0; i < n_batch; ++i) { + LapackEigvals(ctx, input_matrices[i], + &output_vectors[i], &work, &rwork); + } + output->Resize(output_dims); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu b/paddle/fluid/operators/elementwise/elementwise_max_op.cu index 9657e1896e334..65505381db174 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu @@ -41,12 +41,16 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( elementwise_max, + ops::ElementwiseMaxKernel, ops::ElementwiseMaxKernel, ops::ElementwiseMaxKernel, ops::ElementwiseMaxKernel, ops::ElementwiseMaxKernel); REGISTER_OP_CUDA_KERNEL( elementwise_max_grad, + ops::ElementwiseMaxGradKernel, ops::ElementwiseMaxGradKernel, ops::ElementwiseMaxGradKernel, ops::ElementwiseMaxGradKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.h b/paddle/fluid/operators/elementwise/elementwise_max_op.h index 8ee8fe923a811..06269b12e8e20 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.h @@ -39,14 +39,14 @@ class ElementwiseMaxKernel : public framework::OpKernel { template struct MaxGradDx { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * (x > y); + return dout * static_cast(x > y); } }; template struct MaxGradDy { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * (x <= y); + return dout * static_cast(x <= y); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 83aff3b55771e..e4074cc7d7d60 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -75,13 +75,13 @@ int GetVectorizedSizeForTensors( template struct ElementwisePrimitiveCaller { - __device__ inline OutT operator()(Functor func, InT (*args)[VecSize], + __device__ inline void operator()(Functor func, InT (*args)[VecSize], OutT *result); }; template struct ElementwisePrimitiveCaller { - __device__ inline OutT operator()(Functor func, InT (*args)[VecSize], + __device__ inline void operator()(Functor func, InT (*args)[VecSize], OutT *result) { kps::ElementwiseAny(result, args, func); @@ -90,7 +90,7 @@ struct ElementwisePrimitiveCaller { template struct ElementwisePrimitiveCaller { - __device__ inline OutT operator()(Functor func, InT (*args)[VecSize], + __device__ inline void operator()(Functor func, InT (*args)[VecSize], OutT *result) { kps::ElementwiseUnary(result, args[0], func); @@ -99,7 +99,7 @@ struct ElementwisePrimitiveCaller { template struct ElementwisePrimitiveCaller { - __device__ inline OutT 
operator()(Functor func, InT (*args)[VecSize], + __device__ inline void operator()(Functor func, InT (*args)[VecSize], OutT *result) { kps::ElementwiseBinary(result, args[0], args[1], func); @@ -108,7 +108,8 @@ struct ElementwisePrimitiveCaller { template struct ElementwisePrimitiveCaller { - __device__ inline OutT operator()(Functor func, InT **args, OutT *result) { + __device__ inline void operator()(Functor func, InT (*args)[VecSize], + OutT *result) { kps::ElementwiseTernary( result, args[0], args[1], args[2], func); } diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 6cea4bfb990df..4f41ecf04cf43 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -43,10 +43,8 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { auto tz = paddle::framework::vectorize(dout->dims()); memory::data_type dout_type = framework::ToMKLDNNDataType(dout->type()); - std::string key = platform::CreateKey(dev_ctx, tz, dout->format(), - dout->format(), dout_type); - platform::ReorderMKLDNNHandler handler(tz, dout->type(), dout_type, dev_ctx, - onednn_engine, key); + platform::ReorderMKLDNNHandler handler(tz, dout->type(), dout_type, + onednn_engine); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); auto reorder_src_memory_p = handler.AcquireSrcMemory( diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc new file mode 100644 index 0000000000000..be8dad62c3c05 --- /dev/null +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc @@ -0,0 +1,132 @@ + +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
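+//
+// Summary of the gradient rules implemented by EltwiseSubMKLDNNGradKernel
+// below (restating the kernel body, for reference): for z = x - y,
+//   dx = dout   (plain oneDNN reorder of dout),
+//   dy = -dout  (reorder with output scale -1).
+// When y was broadcast in the forward pass, dy is instead produced by a
+// reduction_sum primitive over the broadcast dimensions, with an
+// eltwise_linear(alpha = -1) post-op supplying the sign flip.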
+ +#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" +namespace paddle { +namespace framework { +class ExecutionContext; +} // namespace framework +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + +namespace paddle { +namespace operators { +template +class EltwiseSubMKLDNNGradKernel : public ElemwiseGradKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); + using Tensor = framework::Tensor; + + auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + + auto tz = framework::vectorize(dout->dims()); + memory::data_type dout_type = framework::ToMKLDNNDataType(dout->type()); + platform::ReorderMKLDNNHandler handler(tz, dout->type(), dout_type, + onednn_engine); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + auto reorder_src_memory_p = handler.AcquireSrcMemory( + dout->format(), platform::to_void_cast(dout->data())); + + if (dx) { + auto reorder_dst_memory_p = + handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace()); + auto reorder_p = + handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); + platform::RecordEvent record_reorder("int_reorder", + platform::EventRole::kUniqueOp); + + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); + astream.wait(); + + dx->set_layout(DataLayout::kMKLDNN); + dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); + } + + if (dy) { + // Direct copy + if (dout->dims() == dy->dims()) { + auto reorder_dst_memory_p = + handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace()); + + dnnl::primitive_attr reorder_attr; + std::vector scales = {-1}; + reorder_attr.set_output_scales(0, scales); + auto reorder_p = std::make_shared( + *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); + platform::RecordEvent record_reorder("int_reorder", + platform::EventRole::kUniqueOp); + reorder_p->execute(astream, *reorder_src_memory_p, + *reorder_dst_memory_p); + astream.wait(); + + dy->set_layout(DataLayout::kMKLDNN); + dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); + } else { + // Broadcasting + + dnnl::post_ops po; + po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0); + dnnl::primitive_attr attr; + attr.set_post_ops(po); + + platform::ReductionMKLDNNHandler handler_sum( + dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, + ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), attr); + + auto dy_memory_p = handler_sum.AcquireDstMemory(dy); + auto reduction_p = handler_sum.AcquireForwardPrimitive(); + + reduction_p->execute(astream, { + {DNNL_ARG_SRC, *reorder_src_memory_p}, + {DNNL_ARG_DST, *dy_memory_p}, + }); + astream.wait(); + + dy->set_layout(DataLayout::kMKLDNN); + dy->set_format( + platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( + paddle::framework::vectorize(dy->dims())))); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL( + elementwise_sub, MKLDNN, paddle::platform::CPUPlace, + ops::EltwiseMKLDNNKernel, + ops::EltwiseMKLDNNKernel, + ops::EltwiseMKLDNNKernel, + ops::EltwiseMKLDNNKernel) + +REGISTER_OP_KERNEL(elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace, + 
ops::EltwiseSubMKLDNNGradKernel, + ops::EltwiseSubMKLDNNGradKernel) diff --git a/paddle/fluid/operators/fill_zeros_like_op.cc b/paddle/fluid/operators/fill_zeros_like_op.cc index c727c657ed79d..2d340829332c8 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fill_zeros_like_op.h" +#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -93,7 +94,11 @@ REGISTER_OP_CPU_KERNEL( ops::FillZerosLikeKernel, ops::FillZerosLikeKernel, ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel); + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel>, + ops::FillZerosLikeKernel>); REGISTER_OP_CPU_KERNEL( fill_zeros_like2, @@ -101,4 +106,8 @@ REGISTER_OP_CPU_KERNEL( ops::FillZerosLikeKernel, ops::FillZerosLikeKernel, ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel); + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel>, + ops::FillZerosLikeKernel>); diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc index 1831635def79b..4cb0887c1f326 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fill_zeros_like_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -25,7 +26,11 @@ REGISTER_OP_CUDA_KERNEL( ops::FillZerosLikeKernel, ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel); + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel>, + ops::FillZerosLikeKernel>); REGISTER_OP_CUDA_KERNEL( fill_zeros_like2, @@ -35,4 +40,8 @@ REGISTER_OP_CUDA_KERNEL( ops::FillZerosLikeKernel, ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel); + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel>, + ops::FillZerosLikeKernel>); diff --git a/paddle/fluid/operators/flip_op.cc b/paddle/fluid/operators/flip_op.cc index d062243acf39a..5e6d263f1907b 100644 --- a/paddle/fluid/operators/flip_op.cc +++ b/paddle/fluid/operators/flip_op.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -145,6 +146,7 @@ class FlipOpGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OPERATOR(flip, ops::FlipOp, ops::FlipOpMaker, ops::FlipOpInferVarType, ops::FlipOpGradMaker, ops::FlipOpGradMaker); @@ -153,7 +155,9 @@ REGISTER_OP_CPU_KERNEL( ops::FlipKernel, ops::FlipKernel, ops::FlipKernel, - ops::FlipKernel); + ops::FlipKernel, + ops::FlipKernel>, + ops::FlipKernel>); /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(flip) diff --git a/paddle/fluid/operators/flip_op.cu b/paddle/fluid/operators/flip_op.cu index 581a994ba84b5..26b3d11bc6c7b 100644 --- a/paddle/fluid/operators/flip_op.cu +++ b/paddle/fluid/operators/flip_op.cu @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -163,4 +164,7 @@ REGISTER_OP_CUDA_KERNEL( ops::FlipKernel, ops::FlipKernel, ops::FlipKernel, - ops::FlipKernel); + ops::FlipKernel, + ops::FlipKernel>, + ops::FlipKernel>); diff --git a/paddle/fluid/operators/frame_op.cc b/paddle/fluid/operators/frame_op.cc new file mode 100644 index 0000000000000..7568941e980d1 --- /dev/null +++ b/paddle/fluid/operators/frame_op.cc @@ -0,0 +1,186 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/frame_op.h" + +namespace paddle { +namespace operators { + +class FrameOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "frame"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "frame"); + + const int frame_length = ctx->Attrs().Get("frame_length"); + const int hop_length = ctx->Attrs().Get("hop_length"); + const int axis = ctx->Attrs().Get("axis"); + + const auto x_dims = ctx->GetInputDim("X"); + const int x_rank = x_dims.size(); + + PADDLE_ENFORCE_GE( + x_rank, 1, platform::errors::InvalidArgument( + "Input(X) of FrameOp should be a tensor which contains " + "at least 1 dimension, but got rank %s.", + x_rank)); + PADDLE_ENFORCE_GT(hop_length, 0, + platform::errors::InvalidArgument( + "Attribute(hop_length) of FrameOp should be greater " + "than 0, but got %s.", + hop_length)); + PADDLE_ENFORCE_EQ( + (axis == 0 || axis == -1), true, + platform::errors::InvalidArgument( + "Attribute(axis) of FrameOp should 0 or -1, but got %s.", axis)); + + std::vector output_shape; + int seq_length; + int n_frames; + + int start_axis; + int end_axis; + + if (axis == 0) { + seq_length = x_dims[0]; + start_axis = 1; + end_axis = x_rank - 1; + } else { + seq_length = x_dims[x_rank - 1]; + start_axis = 0; + end_axis = x_rank - 2; + } + + PADDLE_ENFORCE_LE(frame_length, seq_length, + platform::errors::InvalidArgument( + "Attribute(frame_length) of FrameOp should be less " + "equal than sequence length, but got (%s) > (%s).", + frame_length, seq_length)); + + // It won't go into for loop when x_rank == 1U. + for (int i = start_axis; i <= end_axis; i++) { + output_shape.push_back(x_dims[i]); + } + + n_frames = 1 + (seq_length - frame_length) / hop_length; + + if (axis == 0) { + // (n_frames, frame_length, ...) 
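+      // For example, with seq_length = 10, frame_length = 4 and
+      // hop_length = 2: n_frames = 1 + (10 - 4) / 2 = 4, so the framed
+      // output starts with dims (4, 4, ...).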
+ output_shape.insert(output_shape.begin(), frame_length); + output_shape.insert(output_shape.begin(), n_frames); + } else { + // (..., frame_length, n_frames) + output_shape.push_back(frame_length); + output_shape.push_back(n_frames); + } + + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(in_dtype, ctx.GetPlace()); + } +}; + +class FrameOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of frame op."); + AddOutput("Out", "(Tensor), The output tensor of frame op."); + AddAttr( + "frame_length", + "Length of the frame and `0 < frame_length <= x.shape[axis]`."); + AddAttr("hop_length", + "Number of steps to advance between adjacent frames and " + "`0 < hop_length`."); + AddAttr("axis", + "Specify the axis to operate on the input Tensors. Its value " + "should be 0(the first dimension) or -1(the last dimension).") + .SetDefault(-1); + AddComment(R"DOC( + Slice the N-dimensional (where N >= 1) input into (overlapping) frames. + )DOC"); + } +}; + +class FrameOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "frame_grad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "frame_grad"); + const auto x_dims = ctx->GetInputDim("X"); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(in_dtype, ctx.GetPlace()); + } +}; + +template +class FrameOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr retv) const override { + retv->SetType("frame_grad"); + retv->SetInput("X", this->Input("X")); + retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(frame, ops::FrameOp, ops::FrameOpMaker, + ops::FrameOpGradMaker, + ops::FrameOpGradMaker); + +REGISTER_OPERATOR(frame_grad, ops::FrameOpGrad); + +REGISTER_OP_CPU_KERNEL( + frame, ops::FrameKernel, + ops::FrameKernel, + ops::FrameKernel, + ops::FrameKernel, + ops::FrameKernel>, + ops::FrameKernel>); + +REGISTER_OP_CPU_KERNEL( + frame_grad, ops::FrameGradKernel, + ops::FrameGradKernel, + ops::FrameGradKernel, + ops::FrameGradKernel, + ops::FrameGradKernel>, + ops::FrameGradKernel>); diff --git a/paddle/fluid/operators/frame_op.cu b/paddle/fluid/operators/frame_op.cu new file mode 100644 index 0000000000000..797e0aa0111d8 --- /dev/null +++ b/paddle/fluid/operators/frame_op.cu @@ -0,0 +1,41 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/frame_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + frame, ops::FrameKernel, + ops::FrameKernel, + ops::FrameKernel, + ops::FrameKernel, + ops::FrameKernel, + ops::FrameKernel>, + ops::FrameKernel>); + +REGISTER_OP_CUDA_KERNEL( + frame_grad, ops::FrameGradKernel, + ops::FrameGradKernel, + ops::FrameGradKernel, + ops::FrameGradKernel, + ops::FrameGradKernel, + ops::FrameGradKernel>, + ops::FrameGradKernel>); diff --git a/paddle/fluid/operators/frame_op.h b/paddle/fluid/operators/frame_op.h new file mode 100644 index 0000000000000..482c6411812b6 --- /dev/null +++ b/paddle/fluid/operators/frame_op.h @@ -0,0 +1,341 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/seq2col.h" +#include "paddle/fluid/operators/transpose_op.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +template +struct FrameFunctor { + void operator()(const DeviceContext& dev_ctx, const Tensor* input, + Tensor* output, size_t seq_length, size_t frame_length, + size_t n_frames, size_t hop_length, + bool is_grad = false) const { + auto numel = output->numel(); + const auto* input_data = input->data(); + auto* output_data = output->data(); + + platform::ForRange for_range(dev_ctx, numel); + if (!is_grad) { + math::Seq2ColFunctor functor(input_data, output_data, seq_length, + frame_length, n_frames, hop_length); + for_range(functor); + } else { + math::Col2SeqFunctor functor(input_data, output_data, seq_length, + frame_length, n_frames, hop_length); + for_range(functor); + } + } +}; + +template +class FrameKernel : public framework::OpKernel { + public: + /* + Frame kernel slices frames from input sequences. The main steps as follows: + + - Case 1 - input dims == 1: + - axis is -1: Call a FrameFunctor to compute directly. + - axis is 0: Transpose output firstly, and then it falls into + case axis is -1. Finally, it restores the dims of + output tensor. + + - Case 2 - input dims == 2: + - axis is -1: Call a FrameFunctor to compute directly. + - axis is 0: Transpose both input and output firstly, and then it falls + into case axis is -1. 
Finally, it restores the dims of + output tensor. + + - Case 3 - input dims > 2: + Flatten the input and output to 2D and 3D respectively so that it + falls into Case 2. Finally, it restores the dims of output tensor. + */ + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* x = ctx.Input("X"); + Tensor* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + const size_t x_rank = x->dims().size(); + const size_t out_rank = out->dims().size(); + + const int frame_length = ctx.Attr("frame_length"); + const int hop_length = ctx.Attr("hop_length"); + const int axis = ctx.Attr("axis"); + const int n_frames = + (axis == 0) ? out->dims()[0] : out->dims()[out_rank - 1]; + const int seq_length = (axis == 0) ? x->dims()[0] : x->dims()[x_rank - 1]; + + auto& dev_ctx = ctx.device_context(); + + // When the number of input dims is larger than 2, it needs to copy + // from x to resize input into 2d and output into 3d. Morevoer, output + // dims will be restored at the last step. + Tensor x_(x->type()); + x_ = *x; + + framework::DDim preserved_dims; + if (x_rank > 2) { + // Save dims used to flatten both input and output tensors and restore + // output tensor. + framework::DDim x_resized_dims; + framework::DDim out_resized_dims; + if (axis == 0) { + preserved_dims = framework::slice_ddim(x_.dims(), 1, x_rank); + x_resized_dims = {seq_length, framework::product(preserved_dims)}; + out_resized_dims = {n_frames, frame_length, + framework::product(preserved_dims)}; + } else { + preserved_dims = framework::slice_ddim(x_.dims(), 0, x_rank - 1); + x_resized_dims = {framework::product(preserved_dims), seq_length}; + out_resized_dims = {framework::product(preserved_dims), frame_length, + n_frames}; + } + x_.Resize(x_resized_dims); + out->Resize(out_resized_dims); + } + + Tensor trans_x(x_.type()); + Tensor trans_out(out->type()); + + // Transpose input and output in case that axis is 0. + if (axis == 0) { + if (x_rank == 1U) { + trans_x = x_; + + std::vector perm_out{1, 0}; + auto out_dims_vec = framework::vectorize(out->dims()); + for (int i = 0; i < out->dims().size(); ++i) { + out_dims_vec[i] = out->dims()[perm_out[i]]; + } + trans_out.Resize(framework::make_ddim(out_dims_vec)); + trans_out.mutable_data(ctx.GetPlace()); + TransCompute(perm_out.size(), dev_ctx, *out, + &trans_out, perm_out); + } else { + std::vector perm_x{1, 0}; + auto x_dims_vec = framework::vectorize(x_.dims()); + for (int i = 0; i < x_.dims().size(); ++i) { + x_dims_vec[i] = x_.dims()[perm_x[i]]; + } + trans_x.Resize(framework::make_ddim(x_dims_vec)); + trans_x.mutable_data(ctx.GetPlace()); + TransCompute(perm_x.size(), dev_ctx, x_, &trans_x, + perm_x); + + std::vector perm_out{2, 1, 0}; + auto out_dims_vec = framework::vectorize(out->dims()); + for (int i = 0; i < out->dims().size(); ++i) { + out_dims_vec[i] = out->dims()[perm_out[i]]; + } + trans_out.Resize(framework::make_ddim(out_dims_vec)); + trans_out.mutable_data(ctx.GetPlace()); + TransCompute(perm_out.size(), dev_ctx, *out, + &trans_out, perm_out); + } + } else { + trans_x = x_; + trans_out = *out; + } + + FrameFunctor()(dev_ctx, &trans_x, &trans_out, seq_length, + frame_length, n_frames, hop_length, + /*is_grad*/ false); + + // Transpose output in case axis is 0. 
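+    // FrameFunctor always lays frames out along the last axis, so when
+    // axis == 0 the result in trans_out has to be permuted back into the
+    // (n_frames, frame_length, ...) layout expected by `out`.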
+ if (axis == 0) { + if (x_rank == 1U) { + std::vector perm_out{1, 0}; + TransCompute(perm_out.size(), dev_ctx, trans_out, out, + perm_out); + } else { + std::vector perm_out{2, 1, 0}; + TransCompute(perm_out.size(), dev_ctx, trans_out, out, + perm_out); + } + } + + // Restore output dims when the number of dims is larger than 2. + if (x_rank > 2) { + std::vector restored_out_shape; + for (int i = 0; i < preserved_dims.size(); i++) { + restored_out_shape.push_back(preserved_dims[i]); + } + + if (axis == 0) { + // (n_frames, frame_length, ...) + restored_out_shape.insert(restored_out_shape.begin(), frame_length); + restored_out_shape.insert(restored_out_shape.begin(), n_frames); + } else { + // (..., frame_length, n_frames) + restored_out_shape.push_back(frame_length); + restored_out_shape.push_back(n_frames); + } + + out->Resize(framework::make_ddim(restored_out_shape)); + } + } +}; + +template +class FrameGradKernel : public framework::OpKernel { + public: + /* + Frame gradient kernel accumulate gradient `d_x` from `d_out`. The + main steps as follows: + + - Case 1 - d_x dims == 1: + - axis is -1: Call a FrameFunctor to compute directly. Notes that + `is_grad` is set to true to select gradient data functor. + - axis is 0: Transpose `d_out` firstly, and then it falls into + case axis is -1. + + - Case 2 - d_x dims == 2: + - axis is -1: Call a FrameFunctor to compute directly. + - axis is 0: Transpose both `d_x` and `d_out` firstly, and then it + falls into case axis is -1. Finally, it restores the + dims of `d_x`. + + - Case 3 - d_x dims > 2: + Flatten the `d_x` and `d_out` to 2D and 3D respectively so that it + falls into Case 2. Finally, it restores the dims of `d_x` tensor. + */ + void Compute(const framework::ExecutionContext& ctx) const { + const Tensor* d_out = ctx.Input(framework::GradVarName("Out")); + Tensor* d_x = ctx.Output(framework::GradVarName("X")); + d_x->mutable_data(ctx.GetPlace()); + const size_t d_out_rank = d_out->dims().size(); + const size_t d_x_rank = d_x->dims().size(); + + const int frame_length = ctx.Attr("frame_length"); + const int hop_length = ctx.Attr("hop_length"); + const int axis = ctx.Attr("axis"); + const int n_frames = + (axis == 0) ? d_out->dims()[0] : d_out->dims()[d_out_rank - 1]; + const int seq_length = + (axis == 0) ? d_x->dims()[0] : d_x->dims()[d_x_rank - 1]; + + auto& dev_ctx = ctx.device_context(); + + Tensor d_out_(d_out->type()); + d_out_ = *d_out; + + framework::DDim preserved_dims; + if (d_x_rank > 2) { + // Save dims used to flatten both input and output tensors and restore + // output tensor. + framework::DDim d_x_resized_dims; + framework::DDim d_out_resized_dims; + if (axis == 0) { + preserved_dims = framework::slice_ddim(d_x->dims(), 1, d_x_rank); + d_x_resized_dims = {seq_length, framework::product(preserved_dims)}; + d_out_resized_dims = {n_frames, frame_length, + framework::product(preserved_dims)}; + } else { + preserved_dims = framework::slice_ddim(d_x->dims(), 0, d_x_rank - 1); + d_x_resized_dims = {framework::product(preserved_dims), seq_length}; + d_out_resized_dims = {framework::product(preserved_dims), frame_length, + n_frames}; + } + d_x->Resize(d_x_resized_dims); + d_out_.Resize(d_out_resized_dims); + } + + Tensor trans_d_x(d_x->type()); + Tensor trans_d_out(d_out_.type()); + + // Transpose input and output in case that axis is 0. 
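+    // As in the forward kernel, FrameFunctor works in the axis == -1
+    // layout, so for axis == 0 the gradients are transposed into that
+    // layout here and d_x is transposed back after the accumulation.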
+ if (axis == 0) { + if (d_x_rank == 1U) { + trans_d_x = *d_x; + + std::vector perm_d_out{1, 0}; + auto d_out_dims_vec = framework::vectorize(d_out_.dims()); + for (int i = 0; i < d_out_.dims().size(); ++i) { + d_out_dims_vec[i] = d_out_.dims()[perm_d_out[i]]; + } + trans_d_out.Resize(framework::make_ddim(d_out_dims_vec)); + trans_d_out.mutable_data(ctx.GetPlace()); + TransCompute(perm_d_out.size(), dev_ctx, d_out_, + &trans_d_out, perm_d_out); + } else { + std::vector perm_d_x{1, 0}; + auto d_x_dims_vec = framework::vectorize(d_x->dims()); + for (int i = 0; i < d_x->dims().size(); ++i) { + d_x_dims_vec[i] = d_x->dims()[perm_d_x[i]]; + } + trans_d_x.Resize(framework::make_ddim(d_x_dims_vec)); + trans_d_x.mutable_data(ctx.GetPlace()); + TransCompute(perm_d_x.size(), dev_ctx, *d_x, + &trans_d_x, perm_d_x); + + std::vector perm_d_out{2, 1, 0}; + auto d_out_dims_vec = framework::vectorize(d_out_.dims()); + for (int i = 0; i < d_out_.dims().size(); ++i) { + d_out_dims_vec[i] = d_out_.dims()[perm_d_out[i]]; + } + trans_d_out.Resize(framework::make_ddim(d_out_dims_vec)); + trans_d_out.mutable_data(ctx.GetPlace()); + TransCompute(perm_d_out.size(), dev_ctx, d_out_, + &trans_d_out, perm_d_out); + } + } else { + trans_d_x = *d_x; + trans_d_out = d_out_; + } + + FrameFunctor()(dev_ctx, &trans_d_out, &trans_d_x, + seq_length, frame_length, n_frames, + hop_length, + /*is_grad*/ true); + + // Transpose output in case axis is 0. + if (axis == 0 && d_x_rank > 1U) { + std::vector perm_d_x{1, 0}; + TransCompute(perm_d_x.size(), dev_ctx, trans_d_x, d_x, + perm_d_x); + } + + // Restore output dims when the number of dims is larger than 2. + if (d_x_rank > 2) { + std::vector restored_d_x_shape; + for (int i = 0; i < preserved_dims.size(); i++) { + restored_d_x_shape.push_back(preserved_dims[i]); + } + + if (axis == 0) { + // (seq_length, ...) 
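+        // For example, an input of shape (10, 2, 3) framed along axis 0
+        // has its gradient restored here from the flattened (10, 6) back
+        // to (10, 2, 3).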
+ restored_d_x_shape.insert(restored_d_x_shape.begin(), seq_length); + } else { + // (..., seq_length) + restored_d_x_shape.push_back(seq_length); + } + + d_x->Resize(framework::make_ddim(restored_d_x_shape)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 0a12735acf2a0..599be6912b760 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -74,7 +74,11 @@ if (WITH_GPU OR WITH_ROCM) # fused_dropout # only support CUDA if(NOT WITH_ROCM) - nv_test(test_fused_residual_dropout_bias SRCS fused_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op device_context generator memory) - nv_test(test_fused_dropout_act_bias SRCS fused_dropout_act_bias_test.cu DEPS tensor op_registry dropout_op device_context generator memory) + nv_test(test_fused_residual_dropout_bias SRCS fused_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory) + nv_test(test_fused_dropout_act_bias SRCS fused_dropout_act_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory) + nv_test(test_fused_layernorm_residual_dropout_bias SRCS fused_layernorm_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory) + endif() + if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000)) + cc_test(test_cudnn_norm_conv SRCS cudnn_norm_conv_test.cc DEPS conv_op blas im2col vol2col depthwise_conv eigen_function tensor op_registry device_context generator memory) endif() endif() diff --git a/paddle/fluid/operators/fused/attention_layer_norm.h b/paddle/fluid/operators/fused/attention_layer_norm.h index d234a0f08531f..43491a9faf18c 100644 --- a/paddle/fluid/operators/fused/attention_layer_norm.h +++ b/paddle/fluid/operators/fused/attention_layer_norm.h @@ -50,7 +50,7 @@ class AttnLayerNorm { } } - void ComputeBackward(const T* x_data, const T* y_data, + void ComputeBackward(const T* x_data, const T* d_y_data, const LayerNormParamType* scale_data, const LayerNormParamType* mean_data, const LayerNormParamType* var_data, T* d_x_data, diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index a8bd35a1b7309..fa3eb19b29995 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -34,6 +34,7 @@ namespace cub = hipcub; #define LAUNCH_BOUNDS(BlockDim) #endif +#include "paddle/fluid/operators/elementwise/elementwise_functor.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" @@ -51,11 +52,6 @@ using CudnnDataType = platform::CudnnDataType; template using ReduceParamType = typename CudnnDataType::BatchNormParamType; -template -struct AddFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a + b; } -}; - template __global__ void BroadcastKernelBinary( diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h new file mode 100644 index 0000000000000..a2001d0a81492 --- /dev/null +++ b/paddle/fluid/operators/fused/attn_gemm.h @@ -0,0 +1,159 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/fused/attn_bias_add.cu.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +// support gemm-nt and gemm-nn, which is used in fused_attention_op. +template +class AttnMatMul { + public: + // (m, n, k) = bsz_seq, output_size, input_size + AttnMatMul(const platform::CUDADeviceContext& dev_ctx, bool transA, + bool transB, int bsz_seq, int output_size, int input_size, + bool compute_bias) + : dev_ctx_(dev_ctx), + transA_(transA), + transB_(transB), + bsz_seq_(bsz_seq), + output_size_(output_size), + input_size_(input_size), + compute_bias_(compute_bias) {} + + ~AttnMatMul() {} + + void ComputeForward(const T* weight_data, const T* input_data, + const T* bias_data, T* output_data, T* bias_out_data) { + // Note: for blas.GEMM API in Paddle, it treats all inputs as row-major. + // here: (transa, transb): nt, input * weight. + CBLAS_TRANSPOSE transA = CblasNoTrans; + CBLAS_TRANSPOSE transB = CblasNoTrans; + if (transA_) { + transA = CblasTrans; + } + if (transB_) { + transB = CblasTrans; + } + T alpha = static_cast(1.0); + T beta = static_cast(0.0); + + // here: (m, n, k) = bsz_seq, output_size, input_size, (input, weight, out) + auto blas = math::GetBlas(dev_ctx_); + blas.GEMM(transA, transB, bsz_seq_, output_size_, input_size_, alpha, + input_data, weight_data, beta, output_data); + if (compute_bias_) { + // compute output + bias + LaunchBiasAddFwKernel(dev_ctx_, bsz_seq_, output_size_, output_data, + bias_data, bias_out_data); + } + } + + void ComputeBackward(const T* input, const T* weight, const T* d_output, + T* d_input, T* d_weight, T* d_bias) { + T alpha = static_cast(1.0); + T beta = static_cast(0.0); + auto blas = math::GetBlas(dev_ctx_); + + CBLAS_TRANSPOSE dB_transA = CblasNoTrans; + CBLAS_TRANSPOSE dB_transB = CblasNoTrans; + CBLAS_TRANSPOSE dA_transA = CblasNoTrans; + CBLAS_TRANSPOSE dA_transB = CblasNoTrans; + int dB_m = 1; + int dB_n = 1; + int dB_k = 1; + int dA_m = 1; + int dA_n = 1; + int dA_k = 1; + + T* dB_input_1_ptr = nullptr; + T* dB_input_2_ptr = nullptr; + T* dB_output_ptr = d_weight; + + T* dA_input_1_ptr = nullptr; + T* dA_input_2_ptr = nullptr; + T* dA_output_ptr = d_input; + + if (!transA_) { + // fw: gemm-nt + if (transB_) { + // bw: gemm-tn, dB = (dC)^t * A + dB_transA = CblasTrans; + dB_transB = CblasNoTrans; + dB_m = output_size_; + dB_n = input_size_; + dB_k = bsz_seq_; + + // bw: gemm-nn, dA = dC * B + dA_transA = CblasNoTrans; + dA_transB = CblasNoTrans; + dA_m = bsz_seq_; + dA_n = input_size_; + dA_k = output_size_; + + blas.GEMM(dB_transA, dB_transB, dB_m, dB_n, dB_k, alpha, d_output, + input, beta, dB_output_ptr); + blas.GEMM(dA_transA, dA_transB, dA_m, dA_n, dA_k, alpha, d_output, + weight, beta, dA_output_ptr); + } else { // fw: gemm-nn + // bw: gemm-tn, dB = A^t * dC + dB_transA = CblasTrans; + dB_transB = CblasNoTrans; + dB_m = input_size_; + dB_n = output_size_; + dB_k = bsz_seq_; + + // bw: 
gemm-nt, dA = dC * B^t + dA_transA = CblasNoTrans; + dA_transB = CblasTrans; + dA_m = bsz_seq_; + dA_n = input_size_; + dA_k = output_size_; + + blas.GEMM(dB_transA, dB_transB, dB_m, dB_n, dB_k, alpha, input, + d_output, beta, dB_output_ptr); + blas.GEMM(dA_transA, dA_transB, dA_m, dA_n, dA_k, alpha, d_output, + weight, beta, dA_output_ptr); + } + } else if (transB_) { + PADDLE_THROW(platform::errors::InvalidArgument( + "AttnMatMul wrapper do not support (transA=T, transB=T)" + "parameters.")); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "AttnMatMul wrapper do not support (transA=T, transB=N)" + "parameters.")); + } + if (compute_bias_) { + LaunchBiasAddBwKernel(dev_ctx_, bsz_seq_, output_size_, d_output, d_bias); + } + } + + private: + const platform::CUDADeviceContext& dev_ctx_; + + bool transA_; + bool transB_; + + int bsz_seq_; + int output_size_; + int input_size_; + + int compute_bias_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/cudnn_fusion_helper.h b/paddle/fluid/operators/fused/cudnn_fusion_helper.h new file mode 100644 index 0000000000000..4434681e60b3b --- /dev/null +++ b/paddle/fluid/operators/fused/cudnn_fusion_helper.h @@ -0,0 +1,162 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/platform/cudnn_desc.h" +#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/dynload/cudnn.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +namespace dynload = platform::dynload; + +#if CUDNN_VERSION >= 8000 + +// A wrapper for cuDNN fused_op API. +class CudnnFusionOp { + public: + explicit CudnnFusionOp(cudnnFusedOps_t op_id) : plan_created_(false) { + // New 'fused op' descriptor creation + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFusedOpsPlan(&op_, op_id)); + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnCreateFusedOpsConstParamPack(&op_const_params_, op_id)); + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFusedOpsVariantParamPack( + &op_variant_params_, op_id)); + } + + ~CudnnFusionOp() { + // New 'fused op' descriptor destruction + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_)); + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFusedOpsPlan(op_)); + } + + // Execute fused op + void Execute(cudnnHandle_t cudnn_handle) { + PADDLE_ENFORCE_EQ( + plan_created_, true, + platform::errors::Fatal( + "CudnnFusionOp exec requested without a valid 'plan', need: " + ", GetWorkspaceSizeBytes(), Execute().")); + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnFusedOpsExecute(cudnn_handle, op_, op_variant_params_)); + } + + // Set const param pack attribute given a descriptor. 
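+  // Note that every SetOpConstParam* call below clears plan_created_, so
+  // GetWorkspaceSizeInBytes() must be called again before the next
+  // Execute().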
+ template + void SetOpConstParamDesc(cudnnFusedOpsConstParamLabel_t param_label, + T *param_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnSetFusedOpsConstParamPackAttribute( + op_const_params_, param_label, param_ptr)); + plan_created_ = false; + } + + // Set multiple const param pack attribute given a descriptor. + template + void SetOpConstParamDesc( + const std::vector ¶m_labels, + T *param_ptr) { + for (auto param_label : param_labels) { + SetOpConstParamDesc(param_label, param_ptr); + } + } + + // Set const param pack attribute given a value of param. + template + void SetOpConstParamAttr(cudnnFusedOpsConstParamLabel_t param_label, + T param) { + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnSetFusedOpsConstParamPackAttribute(op_const_params_, + param_label, ¶m)); + plan_created_ = false; + } + + // Set multiple const param pack attribute given a value of param. + template + void SetOpConstParamAttr( + const std::vector ¶m_labels, + T param) { + for (auto param_label : param_labels) { + SetOpConstParamAttr(param_label, param); + } + } + + // Set a variant param pack attribute given a reference to a param. + template + void SetOpVariantParamAttrPtr(cudnnFusedOpsVariantParamLabel_t param_label, + T *param_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnSetFusedOpsVariantParamPackAttribute( + op_variant_params_, param_label, param_ptr)); + } + + // Set multiple const param pack attributes given a reference to a param. + template + void SetOpVariantParamAttrPtr( + const std::vector ¶m_labels, + const T *param_ptr) { + for (auto param_label : param_labels) { + SetOpVariantParamAttrPtr(param_label, param_ptr); + } + } + + // Get the workspace, which is required before Execute(). + size_t GetWorkspaceSizeInBytes(cudnnHandle_t cudnn_handle) { + size_t workspace_bytes = 0U; + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan( + cudnn_handle, op_, op_const_params_, &workspace_bytes)); + plan_created_ = true; + return workspace_bytes; + } + + private: + bool plan_created_; + + cudnnFusedOpsPlan_t op_; + cudnnFusedOpsConstParamPack_t op_const_params_; + cudnnFusedOpsVariantParamPack_t op_variant_params_; +}; + +static inline std::vector GetStrides(const std::vector &shape) { + if (shape.size() < 1) { + return {}; + } + int dim = static_cast(shape.size()); + std::vector pro_shape(shape); + std::vector strides(dim); + int temp = pro_shape[1]; + pro_shape.erase(pro_shape.begin() + 1); + pro_shape.push_back(temp); + strides.back() = 1; + for (int i = dim - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * pro_shape[i + 1]; + } + strides.pop_back(); + strides.insert(strides.begin() + 1, 1); + return strides; +} + +static inline int64_t AlignUp(int64_t a, int64_t b) { return (a + b - 1) / b; } + +#endif // CUDNN_VERSION >= 8000 +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h new file mode 100644 index 0000000000000..1ead78b8b64e1 --- /dev/null +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -0,0 +1,139 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +namespace dynload = platform::dynload; + +#if CUDNN_VERSION >= 8000 +template +class CudnnNormConvolutionOp { + public: + CudnnNormConvolutionOp() + : fwd_op_(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS) {} + ~CudnnNormConvolutionOp() {} + + void Init(const platform::CUDADeviceContext &ctx, + const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector &output_shape, const int &pad, + const int &stride, const int &dilate, const int &group) { + cudnn_fwd_compute_type_ = platform::CudnnDataType::type; + dtype_ = platform::CudnnDataType::type; + format_ = CUDNN_TENSOR_NHWC; + + InitDescriptors(ctx, input_shape, filter_shape, output_shape, pad, stride, + dilate, group); + GetWorkspaceSize(ctx); + } + + void Forward(const platform::CUDADeviceContext &ctx, T *input_ptr, + T *filter_ptr, T *output_ptr, float *sum_ptr, + float *sum_of_squares_ptr) { + auto handle = ctx.cudnn_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); + // Set variant_param + // input ptr + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WDATA, filter_ptr); + fwd_op_.SetOpVariantParamAttrPtr( + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &fwd_workspace_byte_); + // output ptr + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr); + workspace_handle.RunFunc( + [&](void *workspace_ptr) { + // workspace ptr + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr); + // fused op execute + fwd_op_.Execute(handle); + }, + fwd_workspace_byte_); + } + + // TBD + void Backward(const platform::CUDADeviceContext &ctx) {} + + private: + void InitDescriptors(const platform::CUDADeviceContext &ctx, + const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector &output_shape, const int &pad, + const int &stride, const int &dilate, const int &group) { + // Set constant_param + fwd_op_.SetOpConstParamAttr( + {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_WDATA_PLACEHOLDER, + CUDNN_PARAM_YDATA_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + fwd_op_.SetOpConstParamAttr( + {CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + + std::vector pad_vec = {pad, pad}; + std::vector stride_vec = {stride, stride}; + std::vector dilate_vec = {dilate, dilate}; + int output_channel = filter_shape[0]; + std::vector stats_shape = {1, 1, 1, output_channel}; + + // set conv desc + conv_desc_.set(dtype_, pad_vec, stride_vec, dilate_vec, false, group); + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC, conv_desc_.desc()); + + // set input desc + in_desc_.set(input_shape, format_, dtype_); + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, in_desc_.desc()); + + // set filter desc + filter_desc_.set(filter_shape, format_, dtype_, group); + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_WDESC, 
filter_desc_.desc()); + + // set output desc + out_desc_.set(output_shape, format_, dtype_); + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YDESC, out_desc_.desc()); + + // set output_stats desc + out_stats_desc_.set(stats_shape, format_, cudnn_fwd_compute_type_); + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YSTATS_DESC, + out_stats_desc_.desc()); + + fwd_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, CUDNN_BATCHNORM_SPATIAL); + } + + void GetWorkspaceSize(const platform::CUDADeviceContext &ctx) { + auto handle = ctx.cudnn_handle(); + fwd_workspace_byte_ = fwd_op_.GetWorkspaceSizeInBytes(handle); + } + + size_t fwd_workspace_byte_ = 0; + + cudnnDataType_t dtype_; + cudnnDataType_t cudnn_fwd_compute_type_; + platform::TensorDescriptor in_desc_; + platform::FilterDescriptor filter_desc_; + platform::TensorDescriptor out_desc_; + platform::TensorDescriptor out_stats_desc_; + platform::ConvolutionDescriptor conv_desc_; + cudnnTensorFormat_t format_; + + CudnnFusionOp fwd_op_; +}; +#endif +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc new file mode 100644 index 0000000000000..125ed85642292 --- /dev/null +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -0,0 +1,262 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/float16.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace op = paddle::operators; +using Tensor = paddle::framework::Tensor; + +USE_OP(conv2d); +USE_OP_DEVICE_KERNEL(conv2d, CUDNN); + +// get paddle conv2d op results as baseline +template +void Conv2DForwardCompute(const Tensor &x, const Tensor &w, Tensor *y, + const platform::CUDADeviceContext &ctx) { + framework::Scope scope; + auto var_x = scope.Var("Input"); + auto tensor_x = var_x->GetMutable(); + auto var_w = scope.Var("Filter"); + auto tensor_w = var_w->GetMutable(); + auto var_y = scope.Var("Output"); + auto tensor_y = var_y->GetMutable(); + + auto place = ctx.GetPlace(); + TensorCopySync(x, place, tensor_x); + TensorCopySync(w, place, tensor_w); + + framework::AttributeMap attrs; + bool use_cudnn = true; + std::string data_format = "NHWC"; + std::string padding_algorithm = "SAME"; + attrs.insert({"use_cudnn", use_cudnn}); + attrs.insert({"data_format", data_format}); + attrs.insert({"padding_algorithm", padding_algorithm}); + + auto op = framework::OpRegistry::CreateOp( + "conv2d", {{"Input", {"Input"}}, {"Filter", {"Filter"}}}, + {{"Output", {"Output"}}}, attrs); + op->Run(scope, ctx.GetPlace()); + + TensorCopySync(*tensor_y, place, y); + ctx.Wait(); +} + +template +class TestCudnnNormConvOpForward { + public: + TestCudnnNormConvOpForward() { + batch_size_ = 2; + height_ = 8; + width_ = 8; + input_channels_ = 8; + output_channels_ = 32; + kernel_size_ = 1; + stride_ = 1; + pad_ = 0; + } + + TestCudnnNormConvOpForward(int batch_size, int height, int width, + int input_channels, int output_channels, + int kernel_size, int stride) { + batch_size_ = batch_size; + height_ = height; + width_ = width; + input_channels_ = input_channels; + output_channels_ = output_channels; + kernel_size_ = kernel_size; + stride_ = stride; + pad_ = (kernel_size_ - 1) / 2; + } + + ~TestCudnnNormConvOpForward() {} + + void SetUp() { + input_size_ = batch_size_ * height_ * width_ * input_channels_; + filter_size_ = + output_channels_ * input_channels_ * kernel_size_ * kernel_size_; + output_size_ = batch_size_ * height_ * width_ * output_channels_; + param_size_ = output_channels_; + + input_vec_.resize(input_size_); + filter_raw_vec_.resize(filter_size_); + filter_pro_vec_.resize(filter_size_); + + std::default_random_engine random(0); + std::uniform_real_distribution dis(0.0, 1.0); + for (int i = 0; i < input_size_; ++i) { + input_vec_[i] = static_cast(dis(random)); + } + for (int i = 0; i < filter_size_; ++i) { + filter_raw_vec_[i] = static_cast(dis(random)); + } + // transpoes for filter + // NCHW->NHWC + for (int oc = 0; oc < output_channels_; ++oc) { + for (int kh = 0; kh < kernel_size_; ++kh) { + for (int kw = 0; kw < kernel_size_; ++kw) { + for (int ic = 0; ic < input_channels_; ++ic) { + int dst_idx = oc * kernel_size_ * kernel_size_ * input_channels_ + + kh * kernel_size_ * input_channels_ + + kw * input_channels_ + ic; + int src_idx = oc * kernel_size_ * kernel_size_ * input_channels_ + + ic * kernel_size_ * kernel_size_ + kh * kernel_size_ + + kw; + filter_pro_vec_[dst_idx] = filter_raw_vec_[src_idx]; + } + } 
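The nested loops above re-lay the baseline filter from OIHW ("NCHW") order into the OHWI ("NHWC") order expected by the fused op. As a hedged illustration (not part of the patch), the two linear indices being matched are:

// Index mapping used when transposing the filter from OIHW (oc, ic, kh, kw)
// to OHWI (oc, kh, kw, ic); IC = input channels, K = kernel size.
inline int OIHWIndex(int oc, int ic, int kh, int kw, int IC, int K) {
  return ((oc * IC + ic) * K + kh) * K + kw;
}
inline int OHWIIndex(int oc, int ic, int kh, int kw, int IC, int K) {
  return ((oc * K + kh) * K + kw) * IC + ic;
}
// filter_ohwi[OHWIIndex(...)] = filter_oihw[OIHWIndex(...)] reproduces the
// element-wise copy performed by the nested loops in SetUp().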
+ } + } + + framework::TensorFromVector(input_vec_, *ctx_, &input_); + input_.Resize({batch_size_, height_, width_, input_channels_}); + framework::TensorFromVector(filter_raw_vec_, *ctx_, &filter_raw_); + filter_raw_.Resize( + {output_channels_, input_channels_, kernel_size_, kernel_size_}); + framework::TensorFromVector(filter_pro_vec_, *ctx_, &filter_pro_); + filter_pro_.Resize( + {output_channels_, kernel_size_, kernel_size_, input_channels_}); + output_.Resize({batch_size_, height_, width_, output_channels_}); + base_output_.Resize({batch_size_, height_, width_, output_channels_}); + sum_.Resize({1, 1, 1, output_channels_}); + sum_of_squares_.Resize({1, 1, 1, output_channels_}); + ctx_->Wait(); + } + + void BaselineForward() { + Conv2DForwardCompute(input_, filter_raw_, &base_output_, *ctx_); + ctx_->Wait(); + } + + // get forward results of cudnn_norm_conv + void FusedForward() { + auto input_shape = framework::vectorize(input_.dims()); + auto filter_shape = framework::vectorize(filter_pro_.dims()); + auto output_shape = framework::vectorize(output_.dims()); + T *input_ptr = input_.data(); + T *filter_ptr = filter_pro_.data(); + T *output_ptr = output_.mutable_data(place_); + float *sum_ptr = sum_.mutable_data(place_); + float *sum_of_squares_ptr = sum_of_squares_.mutable_data(place_); + + std::shared_ptr> conv_op( + new op::CudnnNormConvolutionOp()); + conv_op->Init(*ctx_, input_shape, filter_shape, output_shape, pad_, stride_, + dilate_, group_); + conv_op->Forward(*ctx_, input_ptr, filter_ptr, output_ptr, sum_ptr, + sum_of_squares_ptr); + ctx_->Wait(); + } + + void Run() { + SetUp(); + BaselineForward(); + FusedForward(); + } + + // check forward correctness between baseline and results of normconv. + void CheckOut(const T diff, bool is_relative_atol = false) { + std::vector base_output_vec, output_vec; + output_vec.resize(output_size_); + base_output_vec.resize(output_size_); + TensorToVector(base_output_, *ctx_, &base_output_vec); + TensorToVector(output_, *ctx_, &output_vec); + ctx_->Wait(); + + for (int i = 0; i < output_size_; ++i) { + if (is_relative_atol) { + EXPECT_LT( + std::abs((output_vec[i] - base_output_vec[i]) / base_output_vec[i]), + diff); + } else { + EXPECT_LT(std::abs(output_vec[i] - base_output_vec[i]), diff); + } + } + } + + private: + int batch_size_, height_, width_, input_channels_, output_channels_; + int kernel_size_, stride_, pad_; + const int dilate_ = 1; + const int group_ = 1; + int input_size_, filter_size_, output_size_, param_size_; + + framework::Tensor input_, filter_raw_, filter_pro_, output_, base_output_; + framework::Tensor sum_, sum_of_squares_; + std::vector input_vec_, filter_raw_vec_, filter_pro_vec_; + + platform::CUDAPlace place_ = platform::CUDAPlace(0); + platform::CUDADeviceContext *ctx_ = + static_cast( + platform::DeviceContextPool::Instance().Get(place_)); +}; + +// test for fp16, kernel = 1, output_channels = input_channels +TEST(CudnnNormConvForward, GPUCudnnNormConvForward1Fp16) { + int batch_size = 4; + int height = 56; + int width = 56; + int input_channels = 32; + int output_channels = 32; + int kernel_size = 1; + int stride = 1; + TestCudnnNormConvOpForward test( + batch_size, height, width, input_channels, output_channels, kernel_size, + stride); + test.Run(); + test.CheckOut(static_cast(1e-3), true); +} + +// test for fp16, kernel = 3, output_channels = input_channels +TEST(CudnnNormConvForward, GPUCudnnNormConvForward2Fp16) { + int batch_size = 4; + int height = 56; + int width = 56; + int input_channels = 32; + int 
output_channels = 32; + int kernel_size = 3; + int stride = 1; + TestCudnnNormConvOpForward test( + batch_size, height, width, input_channels, output_channels, kernel_size, + stride); + test.Run(); + test.CheckOut(static_cast(1e-3), true); +} + +// test for fp16, kernel = 1, output_channels = input_channels * 4 +TEST(CudnnNormConvForward, GPUCudnnNormConvForward3Fp16) { + int batch_size = 4; + int height = 56; + int width = 56; + int input_channels = 32; + int output_channels = 128; + int kernel_size = 1; + int stride = 1; + TestCudnnNormConvOpForward test( + batch_size, height, width, input_channels, output_channels, kernel_size, + stride); + test.Run(); + test.CheckOut(static_cast(1e-3), true); +} diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h new file mode 100644 index 0000000000000..bef0052a00d6b --- /dev/null +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -0,0 +1,324 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/dropout_impl.cu.h" +#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/operators/softmax_cudnn_op.cu.h" +#include "paddle/fluid/operators/transpose_op.cu.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class AttnDropoutParam { + public: + AttnDropoutParam() { + is_test_ = false; + dropout_implementation_ = "downgrade_in_infer"; + dropout_prob_ = 0.5; + is_upscale_in_train_ = false; + is_fix_seed_ = false; + seed_val_ = 0; + seed_ = nullptr; + } + AttnDropoutParam(bool is_test, const std::string dropout_implementation, + float dropout_prob, bool is_upscale_in_train, + bool is_fix_seed, int seed_val, const Tensor* seed) { + is_test_ = is_test; + dropout_implementation_ = dropout_implementation; + dropout_prob_ = dropout_prob; + is_upscale_in_train_ = is_upscale_in_train; + is_fix_seed_ = is_fix_seed; + seed_val_ = seed_val; + seed_ = seed; + } + bool is_test_; + std::string dropout_implementation_; + float dropout_prob_; + bool is_upscale_in_train_; + bool is_fix_seed_; + int seed_val_; + const Tensor* seed_; +}; + +template +class FMHARef { + public: + FMHARef(const platform::CUDADeviceContext& dev_ctx, int64_t batch_size, + int64_t seq_len, int64_t num_head, int64_t head_dim, + AttnDropoutParam param) + : dev_ctx_(dev_ctx), + batch_size_(batch_size), + seq_len_(seq_len), + num_head_(num_head), + head_dim_(head_dim), + dropout_param_(param) {} + + ~FMHARef() {} + + void ComputeForward(const Tensor& qkv_input_tensor, + const Tensor& src_mask_tensor, + Tensor* transpose_2_out_tensor, Tensor* qk_out_tensor, + Tensor* src_mask_out_tensor, Tensor* softmax_out_tensor, + Tensor* dropout_mask_out_tensor, + Tensor* dropout_out_tensor, Tensor* qktv_out_tensor, + Tensor* fmha_out_tensor) { + // input shape: [bs, seq_len, 3, num_head, head_dim] + // transpose with perm [2, 0, 1, 3, 4], + // 
output_shape: [3, bs, num_head, seq_len, head_dim] + int ndims = 5; + std::vector perm_1 = {2, 0, 3, 1, 4}; + TransposeGPUKernelDriver(dev_ctx_, ndims, qkv_input_tensor, perm_1, + transpose_2_out_tensor); + + T* qkv_data = transpose_2_out_tensor->data(); + T* qk_out_data = qk_out_tensor->data(); + T* qktv_out_data = qktv_out_tensor->data(); + T* softmax_out_data = softmax_out_tensor->data(); + T* dropout_out_data = dropout_out_tensor->data(); + T* fmha_out_data = fmha_out_tensor->data(); + + int q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; + int k_size = q_size; + T* q_ptr = qkv_data; + T* k_ptr = q_ptr + q_size; + T* v_ptr = k_ptr + k_size; + + // q*k^t, batched_gemm + CBLAS_TRANSPOSE transA = CblasNoTrans; + CBLAS_TRANSPOSE transB = CblasTrans; + auto blas = math::GetBlas(dev_ctx_); + int gemm_batch_size = batch_size_ * num_head_; + int gemm_m = seq_len_; + int gemm_n = seq_len_; + int gemm_k = head_dim_; + T alpha = static_cast(1.0 / sqrt(head_dim_)); + T beta = static_cast(0.0); + int64_t stride_a = gemm_m * gemm_k; + int64_t stride_b = gemm_k * gemm_n; + blas.BatchedGEMM(transA, transB, gemm_m, gemm_n, gemm_k, alpha, q_ptr, + k_ptr, beta, qk_out_data, gemm_batch_size, stride_a, + stride_b); + + std::vector ins; + std::vector outs; + ins.emplace_back(qk_out_tensor); + ins.emplace_back(&src_mask_tensor); + outs.emplace_back(src_mask_out_tensor); + int elewise_add_axis = -1; + int softmax_axis = -1; + if (&src_mask_tensor != nullptr) { + LaunchElementwiseCudaKernel( + dev_ctx_, ins, &outs, elewise_add_axis, AddFunctor()); + SoftmaxForwardCUDAKernelDriver(dev_ctx_, *src_mask_out_tensor, + softmax_axis, softmax_out_tensor); + } else { + SoftmaxForwardCUDAKernelDriver(dev_ctx_, *qk_out_tensor, softmax_axis, + softmax_out_tensor); + } + + transB = CblasNoTrans; + gemm_m = seq_len_; + gemm_n = head_dim_; + gemm_k = seq_len_; + alpha = static_cast(1.0); + stride_a = gemm_m * gemm_k; + stride_b = gemm_k * gemm_n; + + if (dropout_param_.dropout_prob_) { + DropoutFwGPUKernelDriver( + dev_ctx_, dropout_param_.is_test_, + static_cast( + dropout_param_.dropout_implementation_), + dropout_param_.dropout_prob_, dropout_param_.is_upscale_in_train_, + dropout_param_.is_fix_seed_, dropout_param_.seed_val_, + static_cast(*softmax_out_tensor), dropout_param_.seed_, + dropout_mask_out_tensor, dropout_out_tensor); + blas.BatchedGEMM(transA, transB, gemm_m, gemm_n, gemm_k, alpha, + dropout_out_data, v_ptr, beta, qktv_out_data, + gemm_batch_size, stride_a, stride_b); + } else { + // softmax_out * v, batched_gemm + // output shape: [batch_size, num_heads, seq_len, head_dim] + blas.BatchedGEMM(transA, transB, gemm_m, gemm_n, gemm_k, alpha, + softmax_out_data, v_ptr, beta, qktv_out_data, + gemm_batch_size, stride_a, stride_b); + } + // transpose: [0, 2, 1, 3] + // output shape: [batch_size, seq_len, num_heads, head_dim] + std::vector perm_3 = {0, 2, 1, 3}; + ndims = 4; + TransposeGPUKernelDriver(dev_ctx_, ndims, *qktv_out_tensor, perm_3, + fmha_out_tensor); + } + + void ComputeBackward( + const Tensor& transpose_2_out_tensor, const Tensor& src_mask_tensor, + const Tensor& softmax_out_tensor, const Tensor& dropout_mask_out_tensor, + const Tensor& dropout_out_tensor, const Tensor& qk_out_tensor, + const Tensor& src_mask_out_tensor, const Tensor& fmha_out_grad_tensor, + Tensor* qktv_out_grad_tensor, Tensor* dropout_out_grad_tensor, + Tensor* softmax_out_grad_tensor, Tensor* src_mask_out_grad_tensor, + Tensor* qk_out_grad_tensor, Tensor* transpose_2_out_grad_tensor, + Tensor* src_mask_grad_tensor, Tensor* 
qkv_input_grad_tensor) { + auto blas = math::GetBlas(dev_ctx_); + int q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; + int k_size = q_size; + int softmax_axis = -1; + + T* qkv_grad_data = transpose_2_out_grad_tensor->data(); + T* q_grad_ptr = qkv_grad_data; + T* k_grad_ptr = q_grad_ptr + q_size; + T* v_grad_ptr = k_grad_ptr + k_size; + const T* qkv_data = transpose_2_out_tensor.data(); + const T* q_ptr = qkv_data; + const T* k_ptr = q_ptr + q_size; + const T* v_ptr = k_ptr + k_size; + + const T* softmax_out_data = softmax_out_tensor.data(); + T* softmax_out_grad_data = softmax_out_grad_tensor->data(); + const T* dropout_out_data = dropout_out_tensor.data(); + T* dropout_out_grad_data = dropout_out_grad_tensor->data(); + T* qktv_out_grad_data = qktv_out_grad_tensor->data(); + + // transpose bw + int ndims = 4; + std::vector perm_3 = {0, 2, 1, 3}; + TransposeGPUKernelDriver(dev_ctx_, ndims, fmha_out_grad_tensor, perm_3, + qktv_out_grad_tensor); + + // recall batchedgemm(nn) fw: softmax_out_data(x) * v_ptr(y) = + // qktv_out_data(out) + CBLAS_TRANSPOSE transA = CblasTrans; + CBLAS_TRANSPOSE transB = CblasNoTrans; + int gemm_batch_size = batch_size_ * num_head_; + int gemm_m = seq_len_; + int gemm_n = head_dim_; + int gemm_k = seq_len_; + T alpha = static_cast(1.0); + T beta = static_cast(0.0); + int64_t stride_a = gemm_m * gemm_k; + int64_t stride_b = gemm_k * gemm_n; + // bw: dy = x^t * dout + if (dropout_param_.dropout_prob_) { + blas.BatchedGEMM(transA, transB, gemm_m, gemm_n, gemm_k, alpha, + dropout_out_data, qktv_out_grad_data, beta, v_grad_ptr, + gemm_batch_size, stride_a, stride_b); + } else { + blas.BatchedGEMM(transA, transB, gemm_m, gemm_n, gemm_k, alpha, + softmax_out_data, qktv_out_grad_data, beta, v_grad_ptr, + gemm_batch_size, stride_a, stride_b); + } + // bw: dx = dout * y^t + transA = CblasNoTrans; + transB = CblasTrans; + gemm_m = seq_len_; + gemm_n = seq_len_; + gemm_k = head_dim_; + stride_a = gemm_m * gemm_k; + stride_b = gemm_k * gemm_n; + if (dropout_param_.dropout_prob_) { + blas.BatchedGEMM(transA, transB, gemm_m, gemm_n, gemm_k, alpha, + qktv_out_grad_data, v_ptr, beta, dropout_out_grad_data, + gemm_batch_size, stride_a, stride_b); + } else { + blas.BatchedGEMM(transA, transB, gemm_m, gemm_n, gemm_k, alpha, + qktv_out_grad_data, v_ptr, beta, softmax_out_grad_data, + gemm_batch_size, stride_a, stride_b); + } + // dropout bw + if (dropout_param_.dropout_prob_) { + DropoutGradGPUKernelDriver( + dev_ctx_, static_cast( + dropout_param_.dropout_implementation_), + dropout_param_.dropout_prob_, + static_cast(*dropout_out_grad_tensor), + dropout_mask_out_tensor, softmax_out_grad_tensor->numel(), + softmax_out_grad_tensor); + } + + if (&src_mask_tensor != nullptr) { + SoftmaxBackwardCUDAKernelDriver(dev_ctx_, softmax_out_tensor, + *softmax_out_grad_tensor, softmax_axis, + src_mask_out_grad_tensor); + + // recall LaunchElementwiseCudaKernel fw: src_mask_out = qk_out + + // src_mask + // Special case when dy is not needed and dx doesn't reduce + if (qk_out_grad_tensor != nullptr && src_mask_grad_tensor == nullptr && + qk_out_tensor.dims() == src_mask_out_tensor.dims()) { + VLOG(4) << "Special case when dy is not needed and dx doesn't " + "reduce"; + framework::TensorCopy(*src_mask_out_grad_tensor, dev_ctx_.GetPlace(), + dev_ctx_, qk_out_grad_tensor); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Only used for the backward elementwise_add op when" + "dy is not needed and dx is not reduce")); + return; + } + + } else { + 
SoftmaxBackwardCUDAKernelDriver(dev_ctx_, softmax_out_tensor, + *softmax_out_grad_tensor, softmax_axis, + qk_out_grad_tensor); + } + + T* qk_out_grad_data = qk_out_grad_tensor->data(); + alpha = static_cast(1.0 / sqrt(head_dim_)); + // recall batchedgemm(nt) fw: q_ptr * (k_ptr)^t = qk_out + // bw: dy (seq_len * head_dim) = (dout)^t * x + transA = CblasTrans; + transB = CblasNoTrans; + gemm_m = seq_len_; + gemm_n = head_dim_; + gemm_k = seq_len_; + stride_a = gemm_m * gemm_k; + stride_b = gemm_k * gemm_n; + blas.BatchedGEMM(transA, transB, gemm_m, gemm_n, gemm_k, alpha, + qk_out_grad_data, q_ptr, beta, k_grad_ptr, gemm_batch_size, + stride_a, stride_b); + // dx (seq_len * head_dim) = dout * y + transA = CblasNoTrans; + transB = CblasNoTrans; + gemm_m = seq_len_; + gemm_n = head_dim_; + gemm_k = seq_len_; + stride_a = gemm_m * gemm_k; + stride_b = gemm_k * gemm_n; + blas.BatchedGEMM(transA, transB, gemm_m, gemm_n, gemm_k, alpha, + qk_out_grad_data, k_ptr, beta, q_grad_ptr, gemm_batch_size, + stride_a, stride_b); + + // transpose bw + ndims = 5; + std::vector perm_1 = {1, 3, 0, 2, 4}; + TransposeGPUKernelDriver(dev_ctx_, ndims, *transpose_2_out_grad_tensor, + perm_1, qkv_input_grad_tensor); + } + + private: + const platform::CUDADeviceContext& dev_ctx_; + + int64_t batch_size_; + int64_t seq_len_; + int64_t num_head_; + int64_t head_dim_; + + AttnDropoutParam dropout_param_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias.h b/paddle/fluid/operators/fused/fused_dropout_act_bias.h index 7d815bb8c3993..994601a2f0608 100755 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias.h +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias.h @@ -17,8 +17,7 @@ limitations under the License. 
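Before moving on to the dropout kernels: a compact CPU reference (illustrative sketch only, dropout omitted, single batch and head) of the attention math that FMHARef::ComputeForward runs with batched GEMMs, softmax and dropout: scores = Q K^T / sqrt(head_dim) + mask, probs = softmax(scores), context = probs V.

#include <algorithm>
#include <cmath>
#include <vector>

// q, k, v are [seq_len, head_dim] row-major; mask is [seq_len, seq_len]
// (may be empty); out is [seq_len, head_dim].
void AttentionRef(const std::vector<float>& q, const std::vector<float>& k,
                  const std::vector<float>& v, const std::vector<float>& mask,
                  int seq_len, int head_dim, std::vector<float>* out) {
  const float scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
  std::vector<float> probs(seq_len * seq_len);
  for (int i = 0; i < seq_len; ++i) {
    // scores[i][j] = scale * dot(q_i, k_j) (+ mask): the BatchedGEMM(N, T) step
    float row_max = -1e30f, row_sum = 0.0f;
    for (int j = 0; j < seq_len; ++j) {
      float s = 0.0f;
      for (int d = 0; d < head_dim; ++d) s += q[i * head_dim + d] * k[j * head_dim + d];
      s *= scale;
      if (!mask.empty()) s += mask[i * seq_len + j];
      probs[i * seq_len + j] = s;
      row_max = std::max(row_max, s);
    }
    // numerically stable softmax over j
    for (int j = 0; j < seq_len; ++j) {
      probs[i * seq_len + j] = std::exp(probs[i * seq_len + j] - row_max);
      row_sum += probs[i * seq_len + j];
    }
    for (int j = 0; j < seq_len; ++j) probs[i * seq_len + j] /= row_sum;
  }
  // context = probs * V: the second BatchedGEMM(N, N) step (dropout omitted)
  out->assign(seq_len * head_dim, 0.0f);
  for (int i = 0; i < seq_len; ++i)
    for (int j = 0; j < seq_len; ++j)
      for (int d = 0; d < head_dim; ++d)
        (*out)[i * head_dim + d] += probs[i * seq_len + j] * v[j * head_dim + d];
}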
*/ #define _USE_MATH_DEFINES #endif -#include "paddle/fluid/operators/fused/fused_dropout_common.h" -#include "paddle/fluid/operators/math/functors.h" +#include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h" namespace paddle { namespace operators { @@ -75,66 +74,15 @@ __global__ void FusedDropoutActBias( curandStatePhilox4_32_10_t state; curand_init(seed, idx, increment, &state); - T factor = static_cast(1.0f / (1.0f - dropout_prob)); - if (!is_upscale_in_train) { - factor = static_cast(1.0); - } - if (is_test) { - factor = static_cast(1.0f - dropout_prob); - if (is_upscale_in_train) { - factor = static_cast(1.0f); - } - } - - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; - using MaskStoreT = platform::AlignedVector; + const T factor = GetFactor(dropout_prob, is_upscale_in_train, is_test); for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { for (int i = col_id * VecSize; i < cols; i += blockDim.x * gridDim.x * VecSize) { - LoadT src_vec; - LoadT bias_vec; - // vectorize load data from global - platform::Load(&src[r * cols + i], &src_vec); - - if (bias) { - platform::Load(&bias[i], &bias_vec); - } else { -#pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - bias_vec[ii] = static_cast(0); - } - } - - MaskStoreT mask_vec; - if (!is_test) { - float rand[VecSize]; - RandVec(&state, rand); -#pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - mask_vec[ii] = static_cast(rand[ii] >= dropout_prob); - } - } else { -#pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - mask_vec[ii] = static_cast(1); - } - } - - StoreT dest_vec; -#pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - const T tmp = src_vec[ii] + bias_vec[ii]; - const T act_out = act(tmp); - dest_vec[ii] = act_out * static_cast(mask_vec[ii]) * factor; - } - // store result to global - platform::Store(dest_vec, &dst[r * cols + i]); - if (!is_test) { - platform::Store(mask_vec, &mask[r * cols + i]); - } + FusedResidualDropoutBiasOneThread( + r, i, cols, &state, dropout_prob, factor, src, nullptr, bias, dst, + mask, is_test, nullptr, nullptr, act); } } } @@ -197,10 +145,8 @@ __global__ void FusedDropoutActGrad(Functor act_grad, const T *dout, StoreT dx_vec; #pragma unroll for (int ii = 0; ii < VecSize; ii++) { - T args[2]; - args[0] = dout_vec[ii] * static_cast(mask_vec[ii]) * factor; - args[1] = src_vec[ii]; - dx_vec[ii] = args[0] * act_grad.UseOut(args[1]); + T tmp = dout_vec[ii] * static_cast(mask_vec[ii]) * factor; + dx_vec[ii] = tmp * act_grad.UseOut(src_vec[ii]); } platform::Store(dx_vec, &dx[i]); } @@ -243,10 +189,8 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout, #pragma unroll for (int i = 0; i < VecSize; i++) { T val; - T args[2]; - args[0] = dout_vec[i] * static_cast(mask_vec[i]) * factor; - args[1] = src_vec[i] + bias_vec[i]; - val = args[0] * act_grad.UseOut(args[1]); + T tmp = dout_vec[i] * static_cast(mask_vec[i]) * factor; + val = tmp * act_grad.UseOut(src_vec[i] + bias_vec[i]); dx_vec[i] = val; tmp_sum[i] += val; } diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index 02c3a2d6f1a12..3fb58eab077bc 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -21,6 +21,7 @@ limitations under the License. 
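The refactor above replaces the per-kernel scaling-factor computation with the shared GetFactor helper added below in fused_dropout_common.h. As a hedged host-side restatement of that rule (p = dropout_prob):

// Host-side equivalent of the device helper GetFactor().
// With upscale_in_train, kept values are scaled by 1/(1-p) during training and
// left untouched at inference; in the legacy downgrade_in_infer mode they are
// left untouched during training and scaled by (1-p) at inference.
inline float DropoutFactor(float p, bool is_upscale_in_train, bool is_test) {
  if (is_test) {
    return is_upscale_in_train ? 1.0f : 1.0f - p;
  }
  return is_upscale_in_train ? 1.0f / (1.0f - p) : 1.0f;
}
// In the kernels, each kept element becomes value * mask * factor, where mask
// is a per-element 0/1 draw with keep probability 1 - p (all ones in test mode).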
*/ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" +#include "paddle/fluid/operators/math/functors.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/device_context.h" @@ -133,5 +134,17 @@ inline __device__ void CalculateDBias(const T *tmp_sum, T *dbias, } } +template +inline __device__ T GetFactor(const float dropout_prob, + const bool is_upscale_in_train, + const bool is_test) { + T factor = is_upscale_in_train ? static_cast(1.0f / (1.0f - dropout_prob)) + : static_cast(1.0f); + if (is_test) { + factor = is_upscale_in_train ? static_cast(1.0f) + : static_cast(1.0f - dropout_prob); + } + return factor; +} } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index eae2f5457b07f..a0d1cd43404eb 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/layer_norm_kernel.cu.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" @@ -31,6 +32,12 @@ namespace platform = paddle::platform; namespace memory = paddle::memory; USE_OP(dropout); +USE_OP(layer_norm); + +template +using CudnnDataType = platform::CudnnDataType; +template +using LayerNormParamType = typename CudnnDataType::BatchNormParamType; /** * @brief call paddle dropout op @@ -116,6 +123,60 @@ void DropoutGrad(std::vector *dx, const framework::DDim &x_dim, ctx.Wait(); } +/** + * @brief call paddle layer_norm op + */ +template +void LayerNorm(const std::vector> &scale, + const std::vector> &bias, + const std::vector &x, + std::vector> *means, + std::vector> *vars, std::vector *y, + const float epsilon, const int rows, const int cols, + const platform::CUDADeviceContext &ctx) { + framework::Scope scope; + auto place = ctx.GetPlace(); + if (scale.size() > 0) { + auto var_scale = scope.Var("Scale"); + auto tensor_scale = var_scale->GetMutable(); + framework::TensorFromVector(scale, ctx, tensor_scale); + tensor_scale->Resize({cols}); + } + + if (bias.size() > 0) { + auto var_bias = scope.Var("Bias"); + auto tensor_bias = var_bias->GetMutable(); + framework::TensorFromVector(bias, ctx, tensor_bias); + tensor_bias->Resize({cols}); + } + + auto var_x = scope.Var("X"); + auto tensor_x = var_x->GetMutable(); + framework::TensorFromVector(x, ctx, tensor_x); + tensor_x->Resize({rows, cols}); + + auto var_y = scope.Var("Y"); + auto tensor_y = var_y->GetMutable(); + + auto var_mean = scope.Var("Mean"); + auto tensor_mean = var_mean->GetMutable(); + + auto var_variance = scope.Var("Variance"); + auto tensor_variance = var_variance->GetMutable(); + + framework::AttributeMap attrs; + attrs.insert({"epsilon", epsilon}); + + auto op = framework::OpRegistry::CreateOp( + "layer_norm", {{"X", {"X"}}, {"Scale", {"Scale"}}, {"Bias", {"Bias"}}}, + {{"Y", {"Y"}}, {"Mean", {"Mean"}}, {"Variance", {"Variance"}}}, attrs); + op->Run(scope, place); + framework::TensorToVector(*tensor_y, ctx, y); + framework::TensorToVector(*tensor_mean, ctx, means); + framework::TensorToVector(*tensor_variance, ctx, vars); + ctx.Wait(); +} + template inline void 
ReduceSum(const std::vector &dout, std::vector *dbias, const int rows, const int cols) { diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h new file mode 100644 index 0000000000000..f257d3efa433e --- /dev/null +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -0,0 +1,209 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h" + +namespace paddle { +namespace operators { + +template +using CudnnDataType = platform::CudnnDataType; +template +using LayerNormParamType = typename CudnnDataType::BatchNormParamType; + +/** + * @brief fused add_bias, dropout, add residual and leyer_norm into one + * operators. Currently only support forward + */ + +template +__device__ void CalcLayernormY(const LayerNormParamType *scale, + const LayerNormParamType *bias, const T *x, + T *y, const int row_id, const int col_id, + const int cols, + const LayerNormParamType mean_val, + const LayerNormParamType invvar) { + using U = LayerNormParamType; + using LoadT = platform::AlignedVector; + using StoreT = platform::AlignedVector; + using LoadU = platform::AlignedVector; + for (int i = col_id * VecSize; i < cols; i += blockDim.x * VecSize) { + LoadU scale_vec; + LoadU bias_vec; + LoadT x_vec; +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + scale_vec[ii] = static_cast(1); + bias_vec[ii] = static_cast(0); + } + // vectorize load data from global + platform::Load(&x[row_id * cols + i], &x_vec); + + if (scale != nullptr) { + platform::Load(&scale[i], &scale_vec); + } + if (bias != nullptr) { + platform::Load(&bias[i], &bias_vec); + } + + StoreT y_vec; + for (int ii = 0; ii < VecSize; ii++) { + y_vec[ii] = static_cast( + scale_vec[ii] * (static_cast(x_vec[ii]) - mean_val) * invvar + + bias_vec[ii]); + } + platform::Store(y_vec, &y[row_id * cols + i]); + } +} + +/** + * @brief layernorm(residual + dropout(src + bias)); + * @param + * rows: batch_size * seq_len + * cols: feature_size or hidden_size + * src: [rows, cols], inputs + * bias: [cols], linear bias, can be null + * residual:[rows, cols] + * mask: [rows, cols], dropout result + * dst: [rows, cols], residual + dropout(src+bias) + * layernorm_dst: [rows, cols], layernorm result + * layernorm_bias: [cols], layernorm bias, can be null + * scale: [cols]: layernorm scale, can be null + * means: [rows]: layernorm means + * vars: [rows]: layernorm vars + */ +template +__global__ void FusedLayernormResidualDropoutBias( + const size_t rows, const size_t cols, uint64_t seed, + const float dropout_prob, const bool is_upscale_in_train, + const bool is_test, const uint64_t increment, const float epsilon, + const T *src, const T *residual, const T *bias, + const LayerNormParamType *scale, + const LayerNormParamType *layernorm_bias, MaskType *mask, T *dst, + T *layernorm_dst, LayerNormParamType *mean, 
LayerNormParamType *var) { + int col_id = threadIdx.x; + int row_id = blockIdx.x; + int idx = row_id * cols + col_id; + curandStatePhilox4_32_10_t state; + curand_init(seed, idx, increment, &state); + + T factor = GetFactor(dropout_prob, is_upscale_in_train, is_test); + using U = LayerNormParamType; + + __shared__ U mean_share; + __shared__ U var_share; + __shared__ U shared_mean[32]; + __shared__ U shared_var[32]; + + math::ReluFunctor relu; + U mean_val = 0; + U var_val = 0; + for (int i = col_id * VecSize; i < cols; i += blockDim.x * VecSize) { + FusedResidualDropoutBiasOneThread>( + row_id, i, cols, &state, dropout_prob, factor, src, residual, bias, dst, + mask, is_test, &mean_val, &var_val, relu); + } + + mean_val = BlockReduceSum(mean_val, shared_mean); + var_val = BlockReduceSum(var_val, shared_var); + if (threadIdx.x == 0) { + auto scale = static_cast(1.) / static_cast(cols); + auto tmp = mean_val * scale; + mean[row_id] = mean_share = static_cast(tmp); + var_share = static_cast(var_val * scale - mean_share * mean_share); + var_share = var_share > U(0) ? var_share : U(0); + var[row_id] = var_share; + } + __syncthreads(); + + mean_val = mean_share; + U invvar = rsqrt_(var_share + static_cast(epsilon)); + + // calculate layernorm_dst + CalcLayernormY(scale, layernorm_bias, dst, layernorm_dst, row_id, + col_id, cols, mean_val, invvar); +} + +/** + * @brief layernorm(residual + dropout(src + bias)); + * @param + * rows: batch_size * seq_len + * cols: feature_size or hidden_size + * src: [rows, cols], inputs + * bias: [cols], linear bias, can be null + * residual:[rows, cols] + * mask: [rows, cols], dropout result, can be null if is_test = true + * dst: [rows, cols], residual + dropout(src+bias) + * layernorm_dst: [rows, cols], layernorm result + * layernorm_bias: [cols], layernorm bias, can be null + * scale: [cols]: layernorm scale, can be null + * means: [rows]: layernorm means + * vars: [rows]: layernorm vars + */ +template +void LaunchLayernormResidualDropoutBias( + const uint32_t rows, const uint32_t cols, const int increment, + uint64_t seed, const float dropout_prob, const float epsilon, + const bool is_upscale_in_train, const bool is_test, const T *src, + const T *residual, const T *bias, const LayerNormParamType *scale, + const LayerNormParamType *layernorm_bias, MaskType *mask_data, T *dst, + T *layernorm_dst, LayerNormParamType *mean, LayerNormParamType *var, + const platform::CUDADeviceContext &ctx) { + using U = LayerNormParamType; + // dropout_prob == 1.0f + if (std::abs(dropout_prob - 1.0f) < 1e-5) { + auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + memory::Copy(cuda_place, dst, cuda_place, residual, rows * cols * sizeof(T), + ctx.stream()); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync( + mask_data, 0, rows * cols * sizeof(MaskType), ctx.stream())); + + // call layernorm forward + switch (GetDesiredBlockDim(cols)) { + FIXED_BLOCK_DIM_CASE( + LayerNormForward<<>>( + dst, scale, layernorm_bias, layernorm_dst, mean, var, epsilon, + cols)); + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Product from begin_norm_axis to end must be larger than 1")); + break; + } + return; + } + + const int VecSize = MAX_CACHE_BYTES / sizeof(T); + if (cols % VecSize != 0) { + int blockDim = GetDesiredBlockDim(cols); + FusedLayernormResidualDropoutBias<<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, increment, + epsilon, src, residual, bias, scale, layernorm_bias, mask_data, dst, + layernorm_dst, mean, var); + } else { + int 
blockDim = GetDesiredBlockDim(cols / VecSize); + FusedLayernormResidualDropoutBias< + T, uint8_t, VecSize><<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, increment, + epsilon, src, residual, bias, scale, layernorm_bias, mask_data, dst, + layernorm_dst, mean, var); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu new file mode 100644 index 0000000000000..50e3555b4bcd6 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu @@ -0,0 +1,332 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include +#include + +#include "paddle/fluid/operators/fused/fused_dropout_test.h" +#include "paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h" + +/** + * @brief The unit test of fused_layernorm_residual_dropout_bias + */ + +template +struct TestFusedLayernormResidualDropoutBias { + uint32_t rows; + uint32_t cols; + uint64_t seed; + float dropout_prob, epsilon; + bool is_upscale_in_train; + bool is_test; // default false, Set to true for inference only + bool has_bias = true; + bool has_scale = true; + bool has_layernorm_bias = true; + framework::Tensor src, residual, bias, out, mask, scale, layernorm_bias, + layernorm_out, means, vars; + framework::Tensor dsrc, dbias; + + std::vector src_vec, residual_vec, bias_vec; + std::vector> means_vec, vars_vec, scale_vec, + layernorm_bias_vec; + std::vector correct_out, correct_dsrc, correct_dbias, + correct_layernorm_out; + std::vector> correct_means, correct_vars; + std::vector correct_mask; + + platform::CUDAPlace place; + platform::CUDADeviceContext *ctx; + + TestFusedLayernormResidualDropoutBias() { + rows = 32; + cols = 32; + seed = 0; + dropout_prob = 0.0; + is_upscale_in_train = false; + is_test = false; + has_bias = true; + has_scale = true; + has_layernorm_bias = true; + epsilon = 0.00001f; + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto devicectx = pool.Get(place); + ctx = reinterpret_cast(devicectx); + } + + TestFusedLayernormResidualDropoutBias(int _rows, int _cols, + uint64_t _seed = 0, + float _dropout_prob = 0.0, + float _epsilon = 0.00001f, + bool _is_upscale_in_train = false, + bool _is_test = false) { + rows = _rows; + cols = _cols; + seed = _seed; + dropout_prob = _dropout_prob; + epsilon = _epsilon; + is_upscale_in_train = _is_upscale_in_train; + is_test = _is_test; + has_bias = true; + has_scale = true; + has_layernorm_bias = true; + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto devicectx = pool.Get(place); + ctx = reinterpret_cast(devicectx); + } + + ~TestFusedLayernormResidualDropoutBias() {} + + void SetUp() { + using U = LayerNormParamType; + const int n = rows * cols; + correct_out.resize(n); + correct_mask.resize(n); + correct_dsrc.resize(n); + 
correct_dbias.resize(cols); + correct_means.resize(rows); + correct_vars.resize(rows); + correct_layernorm_out.resize(n); + + src_vec.resize(n); + residual_vec.resize(n); + if (has_bias) { + bias_vec.resize(cols); + } + if (has_scale) { + scale_vec.resize(cols); + } + if (has_layernorm_bias) { + layernorm_bias_vec.resize(cols); + } + std::default_random_engine random(time(NULL)); + std::uniform_real_distribution dis(0.0, 1.0); + + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + src_vec[i * cols + j] = static_cast(dis(random)); + residual_vec[i * cols + j] = static_cast(dis(random)); + if (i == 0) { + if (has_bias) { + bias_vec[j] = static_cast(dis(random)); + } + if (has_scale) { + scale_vec[j] = static_cast(dis(random)); + } + if (has_layernorm_bias) { + layernorm_bias_vec[j] = static_cast(dis(random)); + } + } + } + } + + framework::TensorFromVector(src_vec, *ctx, &src); + src.Resize({rows, cols}); + framework::TensorFromVector(residual_vec, *ctx, &residual); + residual.Resize({rows, cols}); + if (has_bias) { + framework::TensorFromVector(bias_vec, *ctx, &bias); + bias.Resize({cols}); + } + if (has_scale) { + framework::TensorFromVector(scale_vec, *ctx, &scale); + scale.Resize({cols}); + } + if (has_layernorm_bias) { + framework::TensorFromVector(layernorm_bias_vec, *ctx, &layernorm_bias); + layernorm_bias.Resize({cols}); + } + + { + out.Resize({rows, cols}); + out.mutable_data(place); + mask.Resize({rows, cols}); + mask.mutable_data(place); + means.Resize({rows}); + means.mutable_data(place); + vars.Resize({rows}); + vars.mutable_data(place); + layernorm_out.Resize({rows, cols}); + layernorm_out.mutable_data(place); + dsrc.Resize({rows, cols}); + dsrc.mutable_data(place); + + if (has_bias) { + dbias.Resize({cols}); + dbias.mutable_data(place); + } + } + } + + void BaseForward() { + using U = LayerNormParamType; + std::vector out1(rows * cols), out2(rows * cols); + if (has_bias) { + // add bias + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + out1[i * cols + j] = src_vec[i * cols + j] + bias_vec[j]; + } + } + // call dropout + Dropout(out1, src.dims(), &out2, &correct_mask, *ctx, seed, + dropout_prob, is_upscale_in_train, is_test); + } else { + Dropout(src_vec, src.dims(), &out2, &correct_mask, *ctx, seed, + dropout_prob, is_upscale_in_train, is_test); + } + // add residual + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + correct_out[i * cols + j] = + residual_vec[i * cols + j] + out2[i * cols + j]; + } + } + + LayerNorm(scale_vec, layernorm_bias_vec, correct_out, &correct_means, + &correct_vars, &correct_layernorm_out, epsilon, rows, cols, + *ctx); + ctx->Wait(); + } + + void FusedForward() { + using U = LayerNormParamType; + int VecSize = MAX_CACHE_BYTES / sizeof(T); + if (cols % 4 != 0) { + VecSize = 1; + } + int threads = paddle::operators::GetDesiredBlockDim(cols / VecSize); + const int increment = ((cols - 1) / (threads * VecSize) + 1) * VecSize; + + T *bias_ptr = nullptr; + U *scale_ptr = nullptr; + U *layernorm_bias_ptr = nullptr; + if (has_bias) { + bias_ptr = bias.data(); + } + if (has_scale) { + scale_ptr = scale.data(); + } + if (has_layernorm_bias) { + layernorm_bias_ptr = layernorm_bias.data(); + } + + paddle::operators::LaunchLayernormResidualDropoutBias( + rows, cols, increment, seed, dropout_prob, epsilon, is_upscale_in_train, + is_test, src.data(), residual.data(), bias_ptr, scale_ptr, + layernorm_bias_ptr, mask.data(), out.data(), + layernorm_out.data(), means.data(), vars.data(), *ctx); + ctx->Wait(); 
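For clarity, the oracle that BaseForward builds step by step can be written as one reference routine per row (illustrative sketch only; the fused kernel is expected to match layer_norm(residual + dropout(src + bias)) within the test tolerance; upscale_in_train scaling, dropout_prob < 1 and a precomputed keep mask are assumed):

#include <cmath>
#include <cstdint>
#include <vector>

// Reference for one row: out = residual + dropout(src + bias),
// y = scale * (out - mean) / sqrt(var + eps) + ln_bias.
void FusedRowRef(const std::vector<float>& src, const std::vector<float>& residual,
                 const std::vector<float>& bias, const std::vector<uint8_t>& keep_mask,
                 const std::vector<float>& scale, const std::vector<float>& ln_bias,
                 float dropout_prob, float epsilon, std::vector<float>* out,
                 std::vector<float>* y) {
  const int cols = static_cast<int>(src.size());
  const float factor = 1.0f / (1.0f - dropout_prob);  // upscale_in_train
  out->resize(cols);
  y->resize(cols);
  float mean = 0.0f, var = 0.0f;
  for (int j = 0; j < cols; ++j) {
    const float v = (src[j] + bias[j]) * keep_mask[j] * factor + residual[j];
    (*out)[j] = v;
    mean += v;
  }
  mean /= cols;
  for (int j = 0; j < cols; ++j) var += ((*out)[j] - mean) * ((*out)[j] - mean);
  var /= cols;
  const float invvar = 1.0f / std::sqrt(var + epsilon);
  for (int j = 0; j < cols; ++j)
    (*y)[j] = scale[j] * ((*out)[j] - mean) * invvar + ln_bias[j];
}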
+ } + + void Run() { + SetUp(); + BaseForward(); + FusedForward(); + } + + void CheckOut(const T diff) { + using U = LayerNormParamType; + const int n = rows * cols; + std::vector _out(n), _layernorm_out(n); + std::vector _means(rows), _vars(cols); + std::vector _mask(n); + framework::TensorToVector(out, *ctx, &_out); + framework::TensorToVector(layernorm_out, *ctx, &_layernorm_out); + framework::TensorToVector(means, *ctx, &_means); + framework::TensorToVector(vars, *ctx, &_vars); + if (!is_test) { + framework::TensorToVector(mask, *ctx, &_mask); + } + ctx->Wait(); + + for (int i = 0; i < n; i++) { + EXPECT_LT(std::abs(_out[i] - correct_out[i]), diff); + EXPECT_LT(std::abs(_layernorm_out[i] - correct_layernorm_out[i]), diff); + if (!is_test) EXPECT_EQ(_mask[i], correct_mask[i]); + } + for (int i = 0; i < rows; i++) { + EXPECT_LT(std::abs(_means[i] - correct_means[i]), static_cast(diff)); + EXPECT_LT(std::abs(_vars[i] - correct_vars[i]), static_cast(diff)); + } + } +}; + +template +static void BaseTest(const bool is_fp16 = false) { + const int rows = 16; + T default_diff = !is_fp16 ? static_cast(1e-4) : static_cast(1e-2); + for (auto cols : {16, 17}) { + for (auto has_bias : {true, false}) { + for (auto has_scale : {true, false}) { + for (auto has_layernorm_bias : {true, false}) { + TestFusedLayernormResidualDropoutBias test(rows, cols); + test.has_bias = has_bias; + test.has_scale = has_scale; + test.has_layernorm_bias = has_layernorm_bias; + test.Run(); + test.CheckOut(default_diff); + } + } + } + } +} + +TEST(FusedDropout, GPUFusedLayernormResidualDropoutBias) { BaseTest(); } + +TEST(FusedDropout, GPUFusedLayernormResidualDropoutBiasDouble) { + BaseTest(); +} + +TEST(FusedDropout, GPUFusedLayernormResidualDropoutBiasFp16) { + BaseTest(true); +} + +TEST(FusedDropout, GPUFusedLayernormResidualDropoutBiasIsUpscaleInTrain) { + const int rows = 16; + const int cols = 16; + for (auto is_upscale_in_train : {true, false}) { + TestFusedLayernormResidualDropoutBias test( + rows, cols, 0, 1.0, 0.00001f, is_upscale_in_train, false); + test.Run(); + test.CheckOut(static_cast(1e-4)); + } +} + +TEST(FusedDropout, GPUFusedLayernormResidualDropoutBiasIsTest) { + const int rows = 16; + const int cols = 16; + TestFusedLayernormResidualDropoutBias test(rows, cols, 0, 0.35, + 0.00001f, true, true); + test.Run(); + test.CheckOut(static_cast(1e-4)); +} + +TEST(FusedDropout, GPUFusedLayernormResidualDropoutBiasSeed) { + const int rows = 16; + const int cols = 16; + TestFusedLayernormResidualDropoutBias test(rows, cols, 125, 0.0, + 0.00001f, false, false); + test.Run(); + test.CheckOut(static_cast(1e-4)); +} + +TEST(FusedDropout, GPUFusedLayernormResidualDropoutLargeShape) { + const int rows = 512; + const int cols = 512; + TestFusedLayernormResidualDropoutBias test(rows, cols); + test.Run(); + test.CheckOut(static_cast(1e-4)); +} diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index 0230244c98155..d984ad1a27768 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -23,14 +23,15 @@ namespace operators { * @brief The fused function called by every thread * VecSize can be 1, 2, 4 or 8 */ -template +template __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( const int row_id, const int col_id, const int cols, curandStatePhilox4_32_10_t *state, const float dropout_prob, const T factor, const T *__restrict__ src, const T 
*__restrict__ residual, const T *__restrict__ bias, T *dst, MaskType *mask, const bool is_test, typename details::MPTypeTrait::Type *mean_val, - typename details::MPTypeTrait::Type *var_val) { + typename details::MPTypeTrait::Type *var_val, Functor act_func) { using LoadT = platform::AlignedVector; using StoreT = platform::AlignedVector; using MaskStoreT = platform::AlignedVector; @@ -42,10 +43,14 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( #pragma unroll for (int ii = 0; ii < VecSize; ii++) { bias_vec[ii] = static_cast(0); + residual_vec[ii] = static_cast(0); } // vectorize load data from global platform::Load(&src[row_id * cols + col_id], &src_vec); - platform::Load(&residual[row_id * cols + col_id], &residual_vec); + if (residual) { + platform::Load(&residual[row_id * cols + col_id], + &residual_vec); + } if (bias) { platform::Load(&bias[col_id], &bias_vec); @@ -70,9 +75,12 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( #pragma unroll for (int ii = 0; ii < VecSize; ii++) { + T tmp = src_vec[ii] + bias_vec[ii]; + if (Activation) { + tmp = act_func(tmp); + } dest_vec[ii] = - (src_vec[ii] + bias_vec[ii]) * static_cast(mask_vec[ii]) * factor + - residual_vec[ii]; + tmp * static_cast(mask_vec[ii]) * factor + residual_vec[ii]; if (ComputeLayerNorm) { U tmp = static_cast(dest_vec[ii]); *mean_val += tmp; @@ -106,19 +114,15 @@ __global__ void FusedResidualDropoutBias( int idx = row_id * cols + col_id; curandStatePhilox4_32_10_t state; curand_init(seed, idx, increment, &state); - - T factor = is_upscale_in_train ? static_cast(1.0f / (1.0f - dropout_prob)) - : static_cast(1.0f); - if (is_test) { - factor = is_upscale_in_train ? static_cast(1.0f) - : static_cast(1.0f - dropout_prob); - } + const T factor = GetFactor(dropout_prob, is_upscale_in_train, is_test); + math::ReluFunctor relu; for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { for (int i = col_id * VecSize; i < cols; i += blockDim.x * gridDim.x * VecSize) { - FusedResidualDropoutBiasOneThread( + FusedResidualDropoutBiasOneThread>( r, i, cols, &state, dropout_prob, factor, src, residual, bias, dst, - mask, is_test, nullptr, nullptr); + mask, is_test, nullptr, nullptr, relu); } } } diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index d44df536bdd10..1a12e6b565f02 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -165,6 +165,7 @@ struct TestFusedResidualDropoutBias { auto config = paddle::operators::Get1DBlocksAnd2DGrids( *ctx, static_cast(rows), static_cast(cols), VecSize); + const int increment = ((cols - 1) / (config.thread_per_block.x * config.block_per_grid.x * VecSize) + 1) * diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cu b/paddle/fluid/operators/fused/multihead_matmul_op.cu index c19e621b18fa7..69056189ac221 100644 --- a/paddle/fluid/operators/fused/multihead_matmul_op.cu +++ b/paddle/fluid/operators/fused/multihead_matmul_op.cu @@ -132,6 +132,24 @@ void TransQKVWithBias(const int batch, const int seq_len, const int head_size, } } +inline int round_up(int seq_len, int multiple = 32) { + PADDLE_ENFORCE_GT( + multiple, 0, + platform::errors::InvalidArgument( + "multiple should be a positive number,but it's (%d)", multiple)); + return ((seq_len + multiple - 1) / multiple) * multiple; +} + +template +__global__ void broadcast(const T *src, T *dst, const int seq_len, + 
const int head_num) { + int batch_id = blockIdx.x / (head_num * seq_len); + int dst_offset = blockIdx.x * seq_len; + if (threadIdx.x < seq_len) { + dst[threadIdx.x + dst_offset] = src[threadIdx.x + batch_id * seq_len]; + } +} + template class MultiHeadMatMulV2Kernel : public framework::OpKernel { public: @@ -152,6 +170,7 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { int head_number = context.Attr("head_number"); // compute q*k with eltadd auto &device_ctx = context.template device_context(); + auto stream = device_ctx.stream(); // should be (B * S * hidden) auto input_dims = input->dims(); // shouble be (hidden * 3 * all_head_size) @@ -159,7 +178,17 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { int batch = input_dims[0]; int seq_len = input_dims[1]; int hidden = input_dims[2]; - + Tensor temp_bias_tensor; + // if bias_qk is[batch, 1, 1, seq_len], the bias_qk_d need to be broadcasted + if (bias_qk.numel() == (batch * seq_len)) { + temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len}); + auto *temp_qk_bias = temp_bias_tensor.mutable_data(context.GetPlace()); + int grid = batch * head_number * seq_len; + int block = round_up(seq_len); + broadcast<<>>(bias_qk_d, temp_qk_bias, seq_len, + head_number); + bias_qk_d = static_cast(temp_qk_bias); + } int all_head_size = w_dims[2]; int head_size = all_head_size / head_number; @@ -196,7 +225,6 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { auto *qkptr = multihead_temp_data; auto *tptr = multihead_temp_data + scratch_size; - auto stream = device_ctx.stream(); // Do the transpose with bias. // BxSx3xNxH => tptr: 3xBxNxSxH. TransQKVWithBias(batch, seq_len, head_size, head_number, temp_out_data, diff --git a/paddle/fluid/operators/gumbel_softmax_op.cu b/paddle/fluid/operators/gumbel_softmax_op.cu index bf0ac667411d8..6b6290d4af29f 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.cu +++ b/paddle/fluid/operators/gumbel_softmax_op.cu @@ -130,7 +130,6 @@ struct GumbleNoiseGenerator { T* random_data = random_tensor.mutable_data({size}, platform::CUDAPlace()); thrust::counting_iterator index_sequence_begin(0); - const unsigned int seed = std::random_device()(); // generate gumbel noise int device_id = @@ -144,6 +143,7 @@ struct GumbleNoiseGenerator { thrust::device_ptr(random_data), UniformCUDAGenerator(0.00001, 1, seed_offset.first, gen_offset)); } else { + const unsigned int seed = std::random_device()(); thrust::transform(index_sequence_begin, index_sequence_begin + size, thrust::device_ptr(random_data), UniformCUDAGenerator(0.00001, 1, seed)); diff --git a/paddle/fluid/operators/gumbel_softmax_op.h b/paddle/fluid/operators/gumbel_softmax_op.h index c224cc7ca1dd9..f95a4810f4442 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.h +++ b/paddle/fluid/operators/gumbel_softmax_op.h @@ -86,8 +86,7 @@ struct GumbleNoiseGenerator { // generate uniform random number const int size = size_to_axis * size_from_axis; std::uniform_real_distribution dist(0.00001, 1); - const int seed = std::random_device()(); - auto engine = paddle::framework::GetCPURandomEngine(seed); + auto engine = paddle::framework::GetCPURandomEngine(0); Tensor random_tensor; auto* random_data = random_tensor.mutable_data({size}, platform::CPUPlace()); diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives.h b/paddle/fluid/operators/kernel_primitives/compute_primitives.h index 2898a11fd7a60..a36c76d788173 100644 --- a/paddle/fluid/operators/kernel_primitives/compute_primitives.h +++ 
b/paddle/fluid/operators/kernel_primitives/compute_primitives.h @@ -54,8 +54,8 @@ class MPTypeTrait { }; /** - * @brief will be used in BlockYReduce, get the index of reduce_num in shared - * memory + * @brief Will be used in BlockYReduce, get the index of reduce_num in shared + * memory. */ __device__ __forceinline__ int SharedMemoryIndex(int index) { return (threadIdx.y + index) * blockDim.x + threadIdx.x; @@ -83,7 +83,7 @@ __device__ __forceinline__ T WarpReduce(T val, ReduceOp reducer) { */ /** - * @brief BlockXReduce reduce along blockDim.x + * @brief BlockXReduce reduce along blockDim.x. */ template __device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) { @@ -115,7 +115,7 @@ __device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) { } /** - * @brief BlockYReduce reduce along blockDim.y + * @brief BlockYReduce reduce along blockDim.y. */ template __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) { @@ -135,24 +135,33 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) { } // namespace details /** - * @brief unary function - * @param - * T: data type of in - * OutT: data type of out - * NX: the cols of in - * NY: the rows of in - * BlockSize: the config of this device - * OpFunc: compute functor which have an operator() as following - * template + * @brief Perform unary calculation according to OpFunc. Size of input and + * output are the same. + * + * @template paraments + * InT: Data type of in. + * OutT: Data type of out. + * NX: The number of data columns loaded by each thread. + * NY: The number of data rows loaded by each thread. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index, and for xpu, core_id() is used as + * the index. Currently only GPU was supported. + * OpFunc: Compute functor which has an operator() as following: + * template * struct XxxFunctor { - * HOSTDEVICE OutT operator()(const T& a) const { + * HOSTDEVICE OutT operator()(const InT& a) const { * return ...; * } * }; + * + * @param: + * out: The register pointer of out, the size is NX * NY. + * in: The register pointer of in, the size is NX * NY. + * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseUnary(OutT* out, const T* in, +__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX * NY; idx++) { @@ -161,25 +170,35 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const T* in, } /** - * @brief binary function, in1 and in2 have same shape - * @param - * T: data type of in1, in2 - * OutT: data type of out - * NX: the cols of in1, in2 - * NY: the rows of in1, in2 - * BlockSize: the config of this device - * OpFunc: compute functor which have an operator() as following - * template + * @brief Binary calculation according to OpFunc. Size of The input and output + * are the same. + * + * @template paraments + * InT: Data type of in1 and in2. + * OutT: Data type of out. + * NX: The number of data columns loaded by each thread. + * NY: The number of data rows loaded by each thread. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index, and for xpu, core_id() is used as + * the index. Currently only GPU was supported. 
+ * OpFunc: Compute functor which has an operator() as following: + * template * struct XxxFunctor { - * HOSTDEVICE OutT operator()(const T& a, const T& b) const { + * HOSTDEVICE OutT operator()(const InT& a, const InT& b) const { * return ...; * } * }; + * + * @param: + * out: The register pointer of out, the size is NX * NY. + * in1: The register pointer of fist input, size is NX * NY. + * in2: The register pointer of second input, size is NX * NY. + * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseBinary(OutT* out, const T* in1, - const T* in2, +__device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, + const InT* in2, OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX * NY; ++idx) { @@ -188,25 +207,38 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const T* in1, } /** - * @brief ternary function, in1, in2 and in3 have same shape - * @param - * T: data type of in1, in2, in3 - * OutT: data type of out - * NX: the cols of in1, in2 - * NY: the rows of in1, in2 - * BlockSize: the config of this device - * OpFunc: compute functor which have an operator() as following - * template + * @brief Ternary calculation according to OpFunc. Size of input and output + * are the same. + * + * @template paraments + * InT: Data type of in1 and in2. + * OutT: Data type of out. + * NX: The number of data columns loaded by each thread. + * NY: The number of data rows loaded by each thread. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index, and for xpu, core_id() is used as + * the index. Currently only GPU was supported. + * OpFunc: Compute functor which has an operator() as following + * template * struct XxxFunctor { - * HOSTDEVICE OutT operator()(const T& a, const T& b, const T& c) const { + * HOSTDEVICE OutT operator()(const InT& a, const InT& b, const InT& c) + * const { * return ...; * } * }; + * + * @param + * out: The register pointer of out, the size is NX * NY. + * in1: The register pointer of fist input, size is NX * NY. + * in2: The register pointer of second input, size is NX * NY. + * in3: The register pointer of third input, size is NX * NY. + * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseTernary(OutT* out, const T* in1, - const T* in2, const T* in3, +__device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, + const InT* in2, + const InT* in3, OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX * NY; ++idx) { @@ -215,27 +247,36 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const T* in1, } /** - * @brief a general function for elementwise computation, all inputs have - * the same shape. - * @param - * T: data type of in1, in2, in3 - * OutT: data type of out - * NX: the cols of in1, in2 - * NY: the rows of in1, in2 - * BlockSize: the config of this device - * OpFunc: compute functor which have an operator() as following - * template + * @brief Multivariate calculation according to OpFunc. Size of input and output + * are the same. + * + * @template paraments + * InT: Data type of in1, in2 and in3. + * OutT: Data type of out. + * NX: The number of data columns loaded by each thread. + * NY: The number of data rows loaded by each thread. + * BlockSize: Identifies the current device thread index method. 
For GPU, + * threadIdx.x is used as the thread index, and for xpu, core_id() is used as + * the index. Currently only GPU was supported. + * Arity: The size of ins + * OpFunc: Compute functor which has an operator() as following: + * template * struct XxxFunctor { - * HOSTDEVICE OutT operator()(const T* args) const { + * HOSTDEVICE OutT operator()(const InT* args) const { * return ...; * } * }; + * + * @param + * out: The register pointer of out, the size is NX * NY. + * ins: An array of pointers consisting of multiple inputs. + * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseAny(OutT* out, T (*ins)[NX * NY], +__device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], OpFunc compute) { - T args[Arity]; + InT args[Arity]; #pragma unroll for (int idx = 0; idx < NX * NY; ++idx) { #pragma unroll @@ -247,20 +288,36 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, T (*ins)[NX * NY], } /** - * @brief cycle binary function, in1's shape size is [1, NX], in2's shape size - * is [NY, NX], out's shape size is [NY, NX] + * @brief Binary calculation according to OpFunc. Shape of in1 and in2 are the + * different. Shape of in1 is [1, NX], but in2's shape is [NY, NX], the output + * shape is [NY, NX]. + * + * @template paraments + * InT: Data type of in1 and in2. + * OutT: Data type of out. + * NX: The number of data columns loaded by each thread. + * NY: The number of data rows loaded by each thread. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index, and for xpu, core_id() is used as + * the index. Currently only GPU was supported. + * OpFunc: Compute functor which has an operator() as following + * template + * struct XxxFunctor { + * HOSTDEVICE OutT operator()(const InT& a, const InT& b) const { + * return ...; + * } + * }; + * * @param - * T: data type of in1, in2 - * OutT: data type of out - * NX: the cols of in1, in2 - * NY: the rows of in1, in2 - * BlockSize: the config of this device - * OpFunc: compute functor eg: in1 + in2, in1 - in2 + * out: The register pointer of out, the size is NX * NY. + * in1: The register pointer of fist input, size is NX * 1. + * in2: The register pointer of second input, size is NX * NY. + * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void CycleBinary(OutT* out, const T* in1, - const T* in2, OpFunc compute) { +__device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1, + const InT* in2, OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX; idx++) { #pragma unroll @@ -272,26 +329,37 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const T* in1, } /** - * @brief reduce function, in's shape size is [NX, NY]. - * If ReduceMode == kLocalMode then reduce NX, the shape of out is [NY, 1], - * if ReduceMode == kGlobalMode then reduce between different threads, the - * shape of out is [NY, NX]. If reduce_last_dim is false and reduce_num was - * split, BlockYReduce will be called. 
If reduce_last_dim is true and - * reduce_num was split, BlockXReduce will be called - * @typename - * T: data type of in - * NX: the cols of in - * NY: the rows of in - * BlockSize: the config of this device - * OpFunc: reduce functor, eg: CustomSum, CustomMean in reduce_functor_op.h - * @param: - * reducer: reduce functor, eg: CustomSum() - * reduce_last_dim: if in's last dim need to be reduce then reduce_last_dim = - * true + * @brief The Reduce provides collective methods for computing a parallel + * reduction of items partitioned across a CUDA block and intra thread. When + * ReduceMode == kLocalMode, thread reduce along nx. When ReduceMode == + * kGlobalMode, use shared memory to reduce between threads. + * + * @template paraments + * T: The type of data. + * NX: The number of data continuously loaded by each thread. + * NY: The number of data rows loaded by each thread, only NY = 1 was supported. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index, and for xpu, core_id() is used as + * the index. Currently only GPU was supported. + * ReduceFunctor: Compute functor which has an operator() as following + * template + * struct ReduceFunctor { + * HOSTDEVICE InT operator()(const InT& a, const InT& b) const { + * return ...; + * } + * }; + * ReduceMode: Reduce mode, can be kLocalMode, kGlobalMode. + * + * @param + * out: The register pointer of out, the size is NX * NY. + * in: The register pointer of in, the size is NX * NY. + * reducer: Compute function which was declared like ReduceFunctor(). + * reduce_last_dim: if the last dim gets involved in reduction. */ -template -__device__ __forceinline__ void Reduce(T* out, const T* in, OpFunc reducer, +__device__ __forceinline__ void Reduce(T* out, const T* in, + ReduceFunctor reducer, bool reduce_last_dim) { int block_index = blockDim.y; @@ -302,7 +370,7 @@ __device__ __forceinline__ void Reduce(T* out, const T* in, OpFunc reducer, if (block_reduce_y) { #pragma unroll for (int i = 0; i < NY * NX; i++) { // reduce along blockdim.y - out[i] = details::BlockYReduce(out[i], reducer); + out[i] = details::BlockYReduce(out[i], reducer); } } @@ -310,7 +378,7 @@ __device__ __forceinline__ void Reduce(T* out, const T* in, OpFunc reducer, if (reduce_last_dim) { #pragma unroll for (int i = 0; i < NY * NX; i++) { // reduce along blockDim.x - out[i] = details::BlockXReduce(out[i], reducer); + out[i] = details::BlockXReduce(out[i], reducer); } } } else { // else kLocalMode diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h index 3932ba1502ecb..c720bedf0a3af 100644 --- a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h @@ -32,7 +32,13 @@ template struct alignas(sizeof(T) * VecSize) VectorType { T val[VecSize]; }; - +/** + * Fast division : Replace division in CUDA with multiplication to improve + * kernel performance. + * 1. Complete the division calculation on the CPU, and record the calculation + * results by using the divider and shift_val. + * 2. Set the divisor on the GPU through Div() to complete the calculation. + */ struct FastDivMod { // 1st value represents the result of input number divides by recorded divisor // 2nd value represents the result of input number modulo by recorded divisor @@ -71,6 +77,11 @@ struct FastDivMod { uint32_t multiplier; }; +/** + * Configuration of broadcast. 
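For illustration only (not part of the patch): the FastDivMod comment above describes precomputing a multiplier and shift on the CPU so the kernel can replace integer division with a multiply-and-shift. A standalone CPU sketch of one standard construction of such constants, with a brute-force check; the exact constants and types in the patch's FastDivMod may differ.

// Illustration only: multiply-and-shift replacement for unsigned division by
// a fixed divisor. The multiplier and shift are computed once on the host;
// Div() then needs only a high multiply, an add and a shift (on the GPU the
// high multiply would be __umulhi).
#include <cassert>
#include <cstdint>

struct FastDivModSketch {
  explicit FastDivModSketch(uint32_t d) : divisor(d) {
    shift = 0;
    while ((uint64_t(1) << shift) < d) ++shift;  // smallest s with 2^s >= d
    // multiplier = floor(2^32 * (2^s - d) / d) + 1
    multiplier = static_cast<uint32_t>(
        ((uint64_t(1) << 32) * ((uint64_t(1) << shift) - d)) / d + 1);
  }
  uint32_t Div(uint32_t n) const {
    uint32_t hi = static_cast<uint32_t>((uint64_t(n) * multiplier) >> 32);
    return static_cast<uint32_t>((uint64_t(hi) + n) >> shift);
  }
  uint32_t Mod(uint32_t n) const { return n - Div(n) * divisor; }

  uint32_t divisor, shift, multiplier;
};

int main() {
  for (uint32_t d = 1; d < 50; ++d) {
    FastDivModSketch f(d);
    for (uint32_t n = 0; n < 100000; ++n) {
      assert(f.Div(n) == n / d && f.Mod(n) == n % d);
    }
  }
  return 0;
}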
Calculate the input data index according to the + * index of the output data. if input or output shape is [dim0, dim1] then dims + * must be [dim1, dim0]. + */ template struct BroadcastConfig { FastDivMod divmoders[kDims]; @@ -107,65 +118,31 @@ struct BroadcastConfig { } // namespace details /** - * @brief load data from src to dst, src can be 1D data or 2D data. Note that - * you can use this function when you are sure that the data will not cross the - * boundary. - * @typename: - * Tx: data type of src - * Ty: data type of dstt - * NX: the cols of src, dst - * NY: the rows of src, dst - * BlockSize: the config of this device - * @param: - * stride_nx: the stride of cols - * stride_ny: the stride of rows - */ - -template -__device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, - int stride_nx, int stride_ny) { - int thread_offset = threadIdx.x * NX; - - if (NY == 1 && NX == 1) { - dst[0] = static_cast(src[thread_offset]); - } else if (NX == 1) { -#pragma unroll - for (int idy = 0; idy < NY; ++idy) { - dst[idy] = static_cast(src[thread_offset + idy * stride_ny]); - } - } else if (NY == 1) { -#pragma unroll - for (int idx = 0; idx < NX; ++idx) { - dst[idx] = static_cast(src[thread_offset + idx * stride_nx]); - } - } else { -#pragma unroll - for (int idx = 0; idx < NX; ++idx) { -#pragma unroll - for (int idy = 0; idy < NY; ++idy) { - dst[idy * NX + idx] = static_cast( - src[thread_offset + idx * stride_nx + idy * stride_ny]); - } - } - } -} - -/** - * @brief load data from src to dst with stride, src can be 1D data or 2D data. - * When boundary judgment is required, you need to set a to true, and a is false - * by default. - * @typename: - * Tx: data type of src - * Ty: data type of dstt - * NX: the cols of src, dst - * NY: the rows of src, dst - * BlockSize: the config of this device - * IsBoundary: whether to make boundary judgment + * @brief Read 2D data from global memory to registers according to Tx type, and + * store it as Ty type. + * + * @template paraments + * Tx: The type of data stored in the global memory. + * Ty: The type of data that needs to be stored in registers. + * NX: The number of data columns loaded by each thread. + * NY: The number of data rows loaded by each thread. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index, and for xpu, core_id() is used as + * the index. Currently only GPU was supported. + * IsBoundary: Indicates whether to perform block access storage out-of-bounds + * judgment. When the number of data processed by the block is less than + * NX x NY x blockDim, boundary judgment is required to avoid memory access + * crossing the boundary. + * * @param: - * size_nx: number of columns to be processed by the current block - * size_ny: number of rows to be processed by the current block - * stride_nx: the stride of cols - * stride_ny: the stride of rows + * dst: The register pointer of the thread, the size is NX * NY. + * src: Data pointer of the current block. + * size_nx: The current block needs to load size_nx columns of data, this + * parameter will be used when IsBoundary = true. + * size_ny: The current block needs to load size_ny rows of data. This parameter + * will be used when IsBoundary = true. + * stride_nx: The stride of cols. + * stride_ny: The stride of rows. */ template @@ -226,6 +203,17 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, } } +/** + * @brief Initialize register with init_data. 
+ * + * @template paraments + * T: Data type of register. + * NX: Number of data to initialize. + * + * @param: + * dst: The register pointer of the thread, the size is NX. + * init_data: Initial value. + */ template __device__ __forceinline__ void Init(T* dst, T init_data) { #pragma unroll @@ -234,18 +222,27 @@ __device__ __forceinline__ void Init(T* dst, T init_data) { } } -/** @brief: ReadData - * @brief load data from src to dst, src can be 1D data, you should set NY = 1. - * When boundary judgment is required, you need to set a to true, and a is false - * by default. - * @typename: - * T : the data type of src - * NX: the cols of src, dst - * NY: in this function NY only can be 1 - * BlockSize: the config of this device - * IsBoundary: whether to make boundary judgment +/** + * @brief Read 2D data from global memory to registers. When IsBoundary = true + * and (NX % 4 == 0 or Nx % 2 == 0), vectorized load data will be used to + * improve memory access efficiency. + * + * @template paraments + * T: Data type of src and dst. + * NX: The number of data continuously loaded by each thread. + * NY: The number of data rows loaded by each thread, only NY = 1 was supported. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index, and for xpu, core_id() is used as + * the index. Currently only GPU was supported. + * IsBoundary: Whether to make an out-of-bounds judgment on access to memory. + * When the number of data processed by this block is less than + * NX x NY x blockDim, boundary judgment is required to avoid memory access + * crossing the boundary. + * * @param: - * num: number of columns to be processed by the current block + * dst: The register pointer of the thread, the size is NX * NY. + * src: Data pointer of the current block. + * size: The current block needs to load size data continuously. */ template __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src, @@ -279,28 +276,38 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src, } /** - * @brief: read data for broadcast - * @typename: - * T : the data type of src - * NX: the cols of src, dst - * NY: in this function NY only can be 1 - * BlockSize: the config of this device - * ShapeSize: the shape size of out. eg in[1, 35], out[32, 35] then shape size - * is 2 - * IsBoundary: whether to make boundary judgment + * @brief Read 2D data from global memory to registers for broadcast. + * + * @template paraments + * T: The type of data stored in the global memory. + * NX: The number of data columns loaded by each thread. + * NY: The number of data rows loaded by each thread. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index, and for xpu, core_id() is used as + * the index. Currently only GPU was supported. + * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. + * IsBoundary: Indicates whether to perform block access storage out-of-bounds + * judgment. When the number of data processed by the block is less than + * NX x NY x blockDim, boundary judgment is required to avoid memory access + * crossing the boundary. 
+ * * @param: - * block_offset: data offset of this block, blockDim.x * blockIdx.x * NX; - * config: get the global index in src, attention config was declared in host; - * total_num_output: total num of output - * stride_nx: the stride of cols - * stride_ny: the stride of rows + * dst: The register pointer of the thread, the size is NX * NY. + * src: Raw input data pointer of kernel. + * block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX; + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. Please + * refer to the sample code for specific usage. + * total_num_output: Total number of original output. + * stride_nx: The stride of cols. + * stride_ny: The stride of rows. */ -template __device__ __forceinline__ void ReadDataBc( T* dst, const T* __restrict__ src, uint32_t block_offset, - details::BroadcastConfig config, int total_num_output, - int stride_nx, int stride_ny) { + details::BroadcastConfig config, int total_num_output, int stride_nx, + int stride_ny) { uint32_t thread_offset = block_offset + threadIdx.x * NX; uint32_t index_src = 0; @@ -316,7 +323,7 @@ __device__ __forceinline__ void ReadDataBc( } } #pragma unroll - for (int i = 0; i < ShapeSize; ++i) { + for (int i = 0; i < Rank; ++i) { auto fast_divmoder = config.divmoders[i].Divmod(index_output); index_output = fast_divmoder.val[0]; index_src += fast_divmoder.val[1] * config.strides[i]; @@ -327,27 +334,41 @@ __device__ __forceinline__ void ReadDataBc( } /** - * @brief: read data for broadcast - * @typename: - * T : the data type of src - * NX: the cols of src, dst - * NY: in this function NY only can be 1 - * BlockSize: the config of this device - * ShapeSize: the shape size of out. eg in[1, 35], out[32, 35] then shape size - * is 2 - * IndexCal: get the global index in src, attention config was declared in host; - * IsBoundary: whether to make boundary judgment + * @brief Read 2D data from global memory to registers for reduce. + * + * @template paraments + * T: The type of data stored in the global memory. + * NX: The number of data columns loaded by each thread. + * NY: The number of data rows loaded by each thread. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index, and for xpu, core_id() is used as + * the index. Currently only GPU was supported. + * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. + * IsBoundary: Indicates whether to perform block access storage out-of-bounds + * judgment. When the number of data processed by the block is less than + * NX x NY x blockDim, boundary judgment is required to avoid memory access + * crossing the boundary. + * * @param: + * dst: The register pointer of the thread, the size is NX * NY. + * src: Raw input data pointer of kernel. + * block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX; + * index_cal: Calculation configuration of Reduce. It is used to calculate the + * coordinate mapping relationship between output data and input data. Please + * refer to the sample code for specific usage. 
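For illustration only (not part of the patch): ReadDataBc above maps each output index back to an input index by peeling one dimension at a time off the linear index with the precomputed divmods and accumulating input strides, where a stride of 0 stands for a broadcast dimension. A host-side sketch of that coordinate mapping using plain division:

// Illustration only: the coordinate mapping performed by ReadDataBc.
// out_dims / in_strides are given fastest-varying dimension first, matching
// the "dims must be [dim1, dim0]" note on BroadcastConfig above.
#include <cassert>
#include <cstdint>
#include <vector>

uint32_t BroadcastSrcIndex(uint32_t out_index,
                           const std::vector<uint32_t>& out_dims,
                           const std::vector<uint32_t>& in_strides) {
  uint32_t src_index = 0;
  for (size_t i = 0; i < out_dims.size(); ++i) {
    uint32_t coord = out_index % out_dims[i];  // divmoders[i].Divmod(...)
    out_index /= out_dims[i];
    src_index += coord * in_strides[i];        // stride 0 on broadcast dims
  }
  return src_index;
}

int main() {
  // in[1, 35] broadcast against out[32, 35] (the example used above):
  // the column index survives, the row index maps to stride 0.
  std::vector<uint32_t> out_dims = {35, 32};   // fastest dim first
  std::vector<uint32_t> in_strides = {1, 0};   // rows of `in` are broadcast
  assert(BroadcastSrcIndex(0, out_dims, in_strides) == 0);
  assert(BroadcastSrcIndex(34, out_dims, in_strides) == 34);  // (row 0, col 34)
  assert(BroadcastSrcIndex(35, out_dims, in_strides) == 0);   // (row 1, col 0)
  assert(BroadcastSrcIndex(35 * 5 + 7, out_dims, in_strides) == 7);
  return 0;
}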
* block_offset: data offset of this block, blockDim.x * blockIdx.x * NX; * index_cal: get the global index in src, attention config was declared in * host; - * size_nx: number of columns to be processed by the current block - * size_ny: number of rows to be processed by the current block - * stride_nx: the stride of cols - * stride_ny: the stride of rows - * reduce_last_dim: according to the block split set threadIdx + * size_nx: The current block needs to load size_nx columns of data, this + * parameter will be used when IsBoundary = true. + * size_ny: The current block needs to load size_ny rows of data. This parameter + * will be used when IsBoundary = true. + * stride_nx: The stride of cols. + * stride_ny: The stride of rows. + * reduce_last_dim: Used to indicate whether the dimension of reduce contains + * the lowest dimension. */ -template __device__ __forceinline__ void ReadDataReduce( T* dst, const T* __restrict__ src, int block_offset, @@ -397,17 +418,26 @@ __device__ __forceinline__ void ReadDataReduce( } /** - * @brief store data from src to dst, src can be 1D data, you should set NY = 1. - * When boundary judgment is required, you need to set a to true, and a is false - * by default. - * @typename: - * T : the data type of src - * NX: the cols of src, dst - * NY: in this function NY only can be 1 - * BlockSize: the config of this device - * IsBoundary: whether to make boundary judgment + * @brief Write 2D data from registers to global memory. When IsBoundary = true + * and (NX % 4 == 0 or Nx % 2 == 0), the data will be vectorized to improve the + * data loading efficiency + * + * @template paraments + * T: The type of data. + * NX: The number of data continuously loaded by each thread. + * NY: The number of data rows loaded by each thread, only NY = 1 was supported. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index, and for xpu, core_id() is used as + * the index. Currently only GPU was supported. + * IsBoundary: Indicates whether to perform block access storage out-of-bounds + * judgment. When the number of data processed by the block is less than + * NX x NY x blockDim, boundary judgment is required to avoid memory access + * crossing the boundary. + * * @param: - * num: number of columns to be processed by the current block + * dst: Data pointer of the current block. + * src: The register pointer of the thread, the size is NX * NY. + * size: The current block needs to load size data continuously. 
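For illustration only (not part of the patch): the ReadData/WriteData comments above note that when NX is a multiple of the vector width, the pointer is reinterpreted as an aligned VectorType<T, VecSize> so one instruction moves several elements, with a scalar boundary-checked path used otherwise. A CPU-side sketch of that reinterpretation:

// Illustration only: the vectorized-copy idea behind ReadData/WriteData when
// NX % VecSize == 0. The aligned aggregate lets one load/store move VecSize
// elements; the real kernels fall back to a scalar, boundary-checked loop
// when IsBoundary is true.
#include <cassert>

template <typename T, int VecSize>
struct alignas(sizeof(T) * VecSize) VectorType {
  T val[VecSize];
};

template <typename T, int NX, int VecSize>
void VectorizedCopy(T* dst, const T* src) {
  static_assert(NX % VecSize == 0, "NX must be a multiple of VecSize");
  using VecT = VectorType<T, VecSize>;
  const VecT* vec_src = reinterpret_cast<const VecT*>(src);
  VecT* vec_dst = reinterpret_cast<VecT*>(dst);
  for (int i = 0; i < NX / VecSize; ++i) {
    vec_dst[i] = vec_src[i];  // one 16-byte move for float with VecSize = 4
  }
}

int main() {
  alignas(16) float src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  alignas(16) float dst[8] = {};
  VectorizedCopy<float, 8, 4>(dst, src);
  for (int i = 0; i < 8; ++i) assert(dst[i] == src[i]);
  return 0;
}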
*/ template __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, diff --git a/paddle/fluid/operators/label_smooth_op.h b/paddle/fluid/operators/label_smooth_op.h index 760d542505ec1..6b509eb64cce6 100644 --- a/paddle/fluid/operators/label_smooth_op.h +++ b/paddle/fluid/operators/label_smooth_op.h @@ -29,20 +29,21 @@ class LabelSmoothKernel : public framework::OpKernel { auto* dist_t = ctx.Input("PriorDist"); auto label_dim = in_t->dims()[in_t->dims().size() - 1]; out_t->mutable_data(ctx.GetPlace()); - - auto epsilon = ctx.Attr("epsilon"); - auto out = framework::EigenVector::Flatten(*out_t); - auto in = framework::EigenVector::Flatten(*in_t); - auto& dev = *ctx.template device_context().eigen_device(); - if (dist_t) { - auto dist = framework::EigenVector::Flatten(*dist_t); - out.device(dev) = - static_cast(1 - epsilon) * in + - static_cast(epsilon) * - dist.broadcast(Eigen::DSizes(in_t->numel() / label_dim)); - } else { - out.device(dev) = static_cast(1 - epsilon) * in + - static_cast(epsilon / label_dim); + if (label_dim != 0) { + auto epsilon = ctx.Attr("epsilon"); + auto out = framework::EigenVector::Flatten(*out_t); + auto in = framework::EigenVector::Flatten(*in_t); + auto& dev = *ctx.template device_context().eigen_device(); + if (dist_t) { + auto dist = framework::EigenVector::Flatten(*dist_t); + out.device(dev) = static_cast(1 - epsilon) * in + + static_cast(epsilon) * + dist.broadcast(Eigen::DSizes( + in_t->numel() / label_dim)); + } else { + out.device(dev) = static_cast(1 - epsilon) * in + + static_cast(epsilon / label_dim); + } } } }; @@ -54,13 +55,15 @@ class LabelSmoothGradKernel : public framework::OpKernel { auto* d_out_t = ctx.Input(framework::GradVarName("Out")); auto* d_in_t = ctx.Output(framework::GradVarName("X")); d_in_t->mutable_data(ctx.GetPlace()); + auto d_out_dim = d_out_t->dims()[d_out_t->dims().size() - 1]; + if (d_out_dim != 0) { + auto d_out = framework::EigenVector::Flatten(*d_out_t); + auto d_in = framework::EigenVector::Flatten(*d_in_t); - auto d_out = framework::EigenVector::Flatten(*d_out_t); - auto d_in = framework::EigenVector::Flatten(*d_in_t); - - auto epsilon = ctx.Attr("epsilon"); - auto& dev = *ctx.template device_context().eigen_device(); - d_in.device(dev) = static_cast(1 - epsilon) * d_out; + auto epsilon = ctx.Attr("epsilon"); + auto& dev = *ctx.template device_context().eigen_device(); + d_in.device(dev) = static_cast(1 - epsilon) * d_out; + } } }; } // namespace operators diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 06c1eaf881626..4280c86ca99ab 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -35,7 +35,6 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using DataLayout = framework::DataLayout; template using CudnnDataType = platform::CudnnDataType; template diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc index 44ba1e4e497bf..8b7f126808134 100644 --- a/paddle/fluid/operators/lite/lite_engine_op_test.cc +++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc @@ -30,6 +30,8 @@ using paddle::inference::lite::CreateTensor; using paddle::inference::lite::serialize_params; namespace paddle { namespace operators { + +#if defined(PADDLE_WITH_CUDA) TEST(LiteEngineOp, engine_op) { framework::ProgramDesc program; auto* block_ = program.Proto()->mutable_blocks(0); @@ -75,8 +77,8 @@ TEST(LiteEngineOp, 
engine_op) { platform::CPUDeviceContext ctx(place); #endif // Prepare variables. - CreateTensor(&scope, "x", std::vector({2, 4}), false); - CreateTensor(&scope, "y", std::vector({2, 4}), false); + CreateTensor(&scope, "x", std::vector({2, 4}), true); + CreateTensor(&scope, "y", std::vector({2, 4}), true); CreateTensor(&scope, "out", std::vector({2, 4}), false); ASSERT_EQ(block_->ops_size(), 4); @@ -113,5 +115,7 @@ TEST(LiteEngineOp, engine_op) { engine_op->Run(scope, place); LOG(INFO) << "done"; } +#endif + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 25cea2a6711c3..6177ec749ac03 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -78,6 +78,7 @@ else() math_library(beam_search DEPS math_function) endif() math_library(fc DEPS blas) +math_library(lapack_function DEPS dynload_lapack) math_library(matrix_bit_code) @@ -88,6 +89,7 @@ math_library(bert_encoder_functor) math_library(tree2col DEPS math_function) math_library(matrix_inverse) math_library(segment_pooling) +math_library(matrix_solve) cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index bbf7516c538fc..6546f854df0f4 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -247,6 +247,12 @@ class Blas { template void BatchedMatInv(int n, const T** a, T** a_inv, int* info, int batch_size) const; + + // cuBlas solve + template + void BatchedGETRS(CBLAS_TRANSPOSE trans, int n, int nrhs, const T** a, + int lda, int* ipiv, T** b, int ldb, int* info, + int batch_size) const; #endif private: @@ -402,6 +408,12 @@ class BlasT : private Blas { void BatchedMatInv(ARGS... args) const { Base()->template BatchedMatInv(args...); } + + // solve + template + void BatchedGETRS(ARGS... args) const { + Base()->template BatchedGETRS(args...); + } #endif private: diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 477f3e0f6a2dc..6f83faf1e40d8 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -114,6 +114,12 @@ struct CUBlas { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cublasSmatinvBatched(args...)); } + + template + static void GETRS_BATCH(ARGS... args) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cublasSgetrsBatched(args...)); + } }; template <> @@ -182,6 +188,12 @@ struct CUBlas { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cublasDmatinvBatched(args...)); } + + template + static void GETRS_BATCH(ARGS... args) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cublasDgetrsBatched(args...)); + } }; template <> @@ -871,6 +883,20 @@ void Blas::BatchedMatInv(int n, const T **a, }); } +template <> +template +void Blas::BatchedGETRS( + CBLAS_TRANSPOSE trans, int n, int nrhs, const T **a, int lda, int *ipiv, + T **b, int ldb, int *info, int batch_size) const { + // use CUBLAS_OP_C (conjugate transpose) for complex + cublasOperation_t cuTrans = + (trans == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRS_BATCH(handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, + batch_size); + }); +} + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/blas_impl.hip.h b/paddle/fluid/operators/math/blas_impl.hip.h index 788ebc6ad985c..1ce5bac5242ab 100644 --- a/paddle/fluid/operators/math/blas_impl.hip.h +++ b/paddle/fluid/operators/math/blas_impl.hip.h @@ -717,6 +717,19 @@ void Blas::BatchedMatInv(int n, const T **a, }); } +template <> +template +void Blas::BatchedGETRS( + CBLAS_TRANSPOSE trans, int n, int nrhs, const T **a, int lda, int *ipiv, + T **b, int ldb, int *info, int batch_size) const { + rocblas_operation cuTrans = (trans == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GETRS_BATCH(handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, + batch_size); + }); +} } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/complex_functors.h b/paddle/fluid/operators/math/complex_functors.h index c4bd6ec4f14a2..3214adb095376 100644 --- a/paddle/fluid/operators/math/complex_functors.h +++ b/paddle/fluid/operators/math/complex_functors.h @@ -313,6 +313,29 @@ struct ImagToComplexFunctor>> { int64_t numel_; }; +template +struct RealImagToComplexFunctor; + +template +struct RealImagToComplexFunctor>> { + RealImagToComplexFunctor(const Real* input_real, const Real* input_imag, + T* output, int64_t numel) + : input_real_(input_real), + input_imag_(input_imag), + output_(output), + numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx].real = input_real_[idx]; + output_[idx].imag = input_imag_[idx]; + } + + const Real* input_real_; + const Real* input_imag_; + T* output_; + int64_t numel_; +}; + template struct ConjFunctor; diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index 4e2d180e33628..01f05530e34e6 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -14,9 +14,8 @@ #pragma once -#include "Eigen/Core" #include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/fluid/operators/math/lapack_function.h" #include "paddle/fluid/operators/svd_helper.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/cusolver.h" @@ -26,88 +25,6 @@ namespace paddle { namespace operators { namespace math { -template -using EigenTensor = framework::EigenTensor; - -template -using InputMatrixMap = Eigen::Map< - const Eigen::Matrix>; - -template -using OutputMatrixMap = Eigen::Map< - Eigen::Matrix>; - -template -inline void ComputeFloatEigenvaluesAndVectors(ValueType *x_data, - ValueType *eigenvalues_data, - ValueType *eigenvectors_data, - int batches, int rows, int cols, - bool has_vectors) { - int stride = rows * cols; - for (int i = 0; i < batches; i++) { - auto m = InputMatrixMap(x_data + i * stride, rows, cols); - auto eigenvalues = - OutputMatrixMap(eigenvalues_data + i * rows, 1, rows); - auto eigenvectors = - OutputMatrixMap(eigenvectors_data + i * stride, rows, cols); - - Eigen::SelfAdjointEigenSolver> - eigen_solver(m, has_vectors ? 
Eigen::ComputeEigenvectors - : Eigen::EigenvaluesOnly); - PADDLE_ENFORCE_EQ( - eigen_solver.info(), Eigen::Success, - platform::errors::InvalidArgument( - "Self Adjoint Eigen decomposition is not successful. " - "The %d-th input matrice might not be not be positive definite.", - i)); - - eigenvalues = eigen_solver.eigenvalues().transpose(); - if (has_vectors) { - eigenvectors = eigen_solver.eigenvectors().transpose(); - } - } -} - -template -inline void ComputeComplexEigenvaluesAndVectors(T *x_data, - ValueType *eigenvalues_data, - T *eigenvectors_data, - int batches, int rows, int cols, - bool has_vectors) { - using Complex = std::complex; - Complex *input = reinterpret_cast(x_data); - Complex *eigenvectors_data_ = reinterpret_cast(eigenvectors_data); - - int stride = rows * cols; - for (int i = 0; i < batches; i++) { - auto m = InputMatrixMap(input + i * stride, rows, cols); - auto eigenvalues = - OutputMatrixMap(eigenvalues_data + i * rows, 1, rows); - auto eigenvectors = - OutputMatrixMap(eigenvectors_data_ + i * stride, rows, cols); - - Eigen::SelfAdjointEigenSolver< - Eigen::Matrix> - eigen_solver(m, has_vectors ? Eigen::ComputeEigenvectors - : Eigen::EigenvaluesOnly); - PADDLE_ENFORCE_EQ( - eigen_solver.info(), Eigen::Success, - platform::errors::InvalidArgument( - "Self Adjoint Eigen decomposition is not successful. " - "The %d-th input matrice might not be not be positive definite.", - i)); - - eigenvalues = eigen_solver.eigenvalues().transpose(); - if (has_vectors) { - eigenvectors = eigen_solver.eigenvectors().transpose(); - } - } -} - inline int64_t GetBatchSize(framework::DDim dims) { int64_t batch_size = 1; auto dim_size = dims.size(); @@ -117,48 +34,107 @@ inline int64_t GetBatchSize(framework::DDim dims) { return batch_size; } +static void CheckEighResult(const int batch, const int info) { + PADDLE_ENFORCE_LE( + info, 0, + platform::errors::PreconditionNotMet( + "For batch [%d]: the [%d] off-diagonal elements of an intermediate" + "tridiagonal form did not converge to zero", + batch, info)); + PADDLE_ENFORCE_GE( + info, 0, platform::errors::PreconditionNotMet( + "For batch [%d]: the [%d] argument had an illegal value", + batch, info)); +} + +template +struct MatrixEighFunctor { + void operator()(const framework::ExecutionContext &ctx, const Tensor &input, + Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower, + bool has_vectors); +}; + // Calculates the eigenvalues ​​and eigenvectors of Hermitian or real // symmetric matrices, and uses the variable has_vectors to // control whether to return the eigenvectors. 
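For illustration only (not part of the patch): the CPU eigh path that follows relies on LAPACK's two-call convention -- call ?syevd with lwork = -1 (and liwork = -1) to query the optimal workspace sizes, allocate, then call again to compute. A standalone sketch for the real double case; the Fortran symbol name dsyevd_ and 32-bit integer arguments are assumptions about the LAPACK build (link with -llapack).

// Illustration only: the lwork = -1 workspace-query pattern used by the CPU
// MatrixEighFunctor above, shown directly against the Fortran LAPACK symbol.
#include <cstdio>
#include <vector>

extern "C" void dsyevd_(char* jobz, char* uplo, int* n, double* a, int* lda,
                        double* w, double* work, int* lwork, int* iwork,
                        int* liwork, int* info);

int main() {
  int n = 2, lda = 2, info = 0;
  // Column-major symmetric matrix [[2, 1], [1, 2]]; eigenvalues are 1 and 3.
  std::vector<double> a = {2.0, 1.0, 1.0, 2.0};
  std::vector<double> w(n);

  char jobz = 'V';  // also compute eigenvectors
  char uplo = 'L';

  // 1) Workspace query: lwork = liwork = -1 returns the optimal sizes.
  double lwork_opt = 0.0;
  int liwork_opt = 0, lwork = -1, liwork = -1;
  dsyevd_(&jobz, &uplo, &n, a.data(), &lda, w.data(), &lwork_opt, &lwork,
          &liwork_opt, &liwork, &info);

  // 2) Allocate the reported sizes and run the actual decomposition.
  lwork = static_cast<int>(lwork_opt);
  liwork = liwork_opt;
  std::vector<double> work(lwork);
  std::vector<int> iwork(liwork);
  dsyevd_(&jobz, &uplo, &n, a.data(), &lda, w.data(), work.data(), &lwork,
          iwork.data(), &liwork, &info);

  std::printf("info = %d, eigenvalues = %f %f\n", info, w[0], w[1]);
  return info;
}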
-template -struct MatrixEighFunctorCPU { +template +struct MatrixEighFunctor { public: void operator()(const framework::ExecutionContext &ctx, const Tensor &input, Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower, bool has_vectors) { - auto dims = input.dims(); - auto output_value_dim = eigen_values->dims(); + using ValueType = math::Real; + auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); + + auto dito = + math::DeviceIndependenceTensorOperations( + ctx); + + Tensor input_trans; + // lapack is a column-major storge, transpose make the input to + // have a continuous memory layout + input_trans = dito.Transpose(input); + auto *input_vector = input_trans.data(); - int64_t batch_size = 1; + auto dims = input.dims(); int dim_size = dims.size(); - for (int64_t i = 0; i < dim_size - 2; i++) { - batch_size *= dims[i]; - } - auto dito = DeviceIndependenceTensorOperations(ctx); - Tensor input_tensor; - TensorCopy(input, ctx.GetPlace(), &input_tensor); - if (!is_lower) { - input_tensor = dito.Transpose(input); - } - int rows = dims[dims.size() - 2]; + int64_t batch_size = GetBatchSize(dims); - auto *value_data = - eigen_values->mutable_data(output_value_dim, ctx.GetPlace()); + int vector_stride = dims[dim_size - 1] * dims[dim_size - 2]; + int values_stride = dims[dim_size - 1]; + char uplo = is_lower ? 'L' : 'U'; + char jobz = has_vectors ? 'V' : 'N'; + auto n = dims[dim_size - 1]; + auto lda = std::max(1, n); + // if work = -1, it means that you need to use the lapack function to query + // the optimal value + int lwork = -1; // The length of the array work + int lrwork = -1; // The dimension of the array rwork,rwork is REAL array + int liwork = -1; // The dimension of the array iwork + int iwork_opt = -1; // The optimal length of the array liwork + T lwork_opt = static_cast(-1); // The optimal length of the array work + ValueType rwork_opt = + static_cast(-1); // The optimal length of the array rwork + + int info = 0; + // Call lapackEigh to get the optimal size of work data + math::lapackEigh(jobz, uplo, n, input_vector, lda, out_value, + &lwork_opt, lwork, &rwork_opt, lrwork, + &iwork_opt, liwork, &info); + lwork = std::max(1, static_cast(lwork_opt)); + liwork = std::max(1, iwork_opt); + + Tensor rwork_tensor; + ValueType *rwork_data = nullptr; + + // complex type + if (framework::IsComplexType(input.type())) { + lrwork = std::max(1, static_cast(rwork_opt)); + rwork_data = rwork_tensor.mutable_data( + framework::make_ddim({lrwork}), ctx.GetPlace()); + } + Tensor iwork_tensor, work_tensor; + auto *iwork_data = iwork_tensor.mutable_data( + framework::make_ddim({liwork}), ctx.GetPlace()); + auto *work_data = work_tensor.mutable_data(framework::make_ddim({lwork}), + ctx.GetPlace()); - if (framework::IsComplexType(input_tensor.type())) { - auto *x_data = input_tensor.data(); - auto *vector_data = eigen_vectors->mutable_data(dims, ctx.GetPlace()); - ComputeComplexEigenvaluesAndVectors( - x_data, value_data, vector_data, batch_size, rows, rows, has_vectors); - } else { - auto *x_data = input_tensor.data(); - auto *vector_data = - eigen_vectors->mutable_data(dims, ctx.GetPlace()); - ComputeFloatEigenvaluesAndVectors( - x_data, value_data, vector_data, batch_size, rows, rows, has_vectors); + for (auto i = 0; i < batch_size; i++) { + auto *value_data = out_value + i * values_stride; + auto *input_data = input_vector + i * vector_stride; + math::lapackEigh>(jobz, uplo, n, input_data, lda, value_data, + work_data, lwork, rwork_data, lrwork, + iwork_data, liwork, &info); + 
CheckEighResult(i, info); } if (has_vectors) { - *eigen_vectors = dito.Transpose(*eigen_vectors); + PADDLE_ENFORCE_NOT_NULL(eigen_vectors, + platform::errors::InvalidArgument( + "When has_vectors is true," + "the eigenvectors needs to be calculated, " + "so the eigenvectors must be provided.")); + input_trans = dito.Transpose(input_trans); + eigen_vectors->ShareDataWith(input_trans); } } }; @@ -168,15 +144,22 @@ struct MatrixEighFunctorCPU { // Calculates the eigenvalues ​​and eigenvectors of Hermitian or real // symmetric matrices on GPU, and uses the variable has_vectors // to control whether to return the eigenvectors. -template -struct MatrixEighFunctor { +template +struct MatrixEighFunctor { public: void operator()(const framework::ExecutionContext &ctx, const Tensor &input, Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower, bool has_vectors) { + using ValueType = math::Real; auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); - auto *out_vector = eigen_vectors->mutable_data(ctx.GetPlace()); + auto &dev_ctx = ctx.template device_context(); + auto dito = + math::DeviceIndependenceTensorOperations(ctx); + Tensor input_trans; + input_trans = dito.Transpose(input); + auto *input_vector = input_trans.data(); auto &dims = input.dims(); int dim_size = dims.size(); int64_t batch_size = GetBatchSize(dims); @@ -190,14 +173,6 @@ struct MatrixEighFunctor { int lda = std::max(1, n); auto vector_stride = dims[dim_size - 1] * dims[dim_size - 2]; auto values_stride = dims[dim_size - 1]; - - auto &dev_ctx = ctx.template device_context(); - auto dito = - math::DeviceIndependenceTensorOperations(ctx); - Tensor output_v_var_trans = dito.Transpose(input); - TensorCopy(output_v_var_trans, ctx.GetPlace(), eigen_vectors); - int lwork = 0; auto info = memory::Alloc(dev_ctx, sizeof(int) * batch_size); auto *info_ptr = reinterpret_cast(info->ptr()); @@ -205,10 +180,8 @@ struct MatrixEighFunctor { // When the input type is float32, and the feature value input dimension is // greater than or equal to [*,32,32] and less than or equal to // [*,512,512], Syevj has better performance. 
- bool use_syevj = - (eigen_vectors->type() == framework::proto::VarType::FP32 && - values_stride >= 32 && values_stride <= 512); - + bool use_syevj = (input.type() == framework::proto::VarType::FP32 && + values_stride >= 32 && values_stride <= 512); syevjInfo_t syevj_params; if (use_syevj) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -216,52 +189,52 @@ struct MatrixEighFunctor { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cusolverDnSsyevj_bufferSize( dev_ctx.cusolver_dn_handle(), jobz, uplo, n, - reinterpret_cast(out_vector), lda, + reinterpret_cast(input_vector), lda, reinterpret_cast(out_value), &lwork, syevj_params)); } else { - EvdBuffer(dev_ctx.cusolver_dn_handle(), jobz, uplo, n, out_vector, lda, + EvdBuffer(dev_ctx.cusolver_dn_handle(), jobz, uplo, n, input_vector, lda, out_value, &lwork); } - auto work = memory::Alloc(dev_ctx, sizeof(T) * lwork); auto *work_ptr = reinterpret_cast(work->ptr()); - for (auto i = 0; i < batch_size; i++) { - auto vector_data = out_vector + i * vector_stride; - auto value_data = out_value + i * values_stride; + auto *input_data = input_vector + i * vector_stride; + auto *value_data = out_value + i * values_stride; auto handle = dev_ctx.cusolver_dn_handle(); if (use_syevj) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSsyevj( - handle, jobz, uplo, n, reinterpret_cast(vector_data), lda, + handle, jobz, uplo, n, reinterpret_cast(input_data), lda, reinterpret_cast(value_data), reinterpret_cast(work_ptr), lwork, info_ptr, syevj_params)); } else { - Evd(handle, jobz, uplo, n, vector_data, lda, value_data, work_ptr, - lwork, info_ptr); + Evd(handle, jobz, uplo, n, input_data, lda, value_data, work_ptr, lwork, + info_ptr); } - int error_info; + int error_info = 0; memory::Copy(platform::CPUPlace(), &error_info, BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), info_ptr, sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: the [%d] argument had an illegal value", i, - error_info)); + CheckEighResult(i, error_info); } if (use_syevj) { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cusolverDnDestroySyevjInfo(syevj_params)); } - if (has_vectors) { - *eigen_vectors = dito.Transpose(*eigen_vectors); + PADDLE_ENFORCE_NOT_NULL(eigen_vectors, + platform::errors::InvalidArgument( + "When has_vectors is true," + "the eigenvectors needs to be calculated," + "so the eigenvectors must be provided.")); + input_trans = dito.Transpose(input_trans); + eigen_vectors->ShareDataWith(input_trans); } } + using ValueType = math::Real; inline void EvdBuffer(cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, int n, const T *A, int lda, const ValueType *W, int *lwork) const; @@ -271,14 +244,14 @@ struct MatrixEighFunctor { T *work, int lwork, int *devInfo) const; }; -#define FUNC_WITH_TYPES(m) \ - m(float, float, Ssy, float) m(double, double, Dsy, double) \ - m(float, paddle::platform::complex, Che, cuComplex) \ - m(double, paddle::platform::complex, Zhe, cuDoubleComplex) +#define FUNC_WITH_TYPES(m) \ + m(float, Ssy, float) m(double, Dsy, double) \ + m(paddle::platform::complex, Che, cuComplex) \ + m(paddle::platform::complex, Zhe, cuDoubleComplex) -#define EVDBUFFER_INSTANCE(ValueType, T, C, CastType) \ +#define EVDBUFFER_INSTANCE(T, C, CastType) \ template <> \ - inline void MatrixEighFunctor::EvdBuffer( \ + inline void MatrixEighFunctor::EvdBuffer( \ cusolverDnHandle_t handle, cusolverEigMode_t jobz, \ cublasFillMode_t uplo, int n, const T *A, int lda, const ValueType *W, 
\ int *lwork) const { \ @@ -290,9 +263,9 @@ struct MatrixEighFunctor { FUNC_WITH_TYPES(EVDBUFFER_INSTANCE); -#define EVD_INSTANCE(ValueType, T, C, CastType) \ +#define EVD_INSTANCE(T, C, CastType) \ template <> \ - inline void MatrixEighFunctor::Evd( \ + inline void MatrixEighFunctor::Evd( \ cusolverDnHandle_t handle, cusolverEigMode_t jobz, \ cublasFillMode_t uplo, int n, T *A, int lda, ValueType *W, T *work, \ int lwork, int *devInfo) const { \ diff --git a/paddle/fluid/operators/math/lapack_function.cc b/paddle/fluid/operators/math/lapack_function.cc new file mode 100644 index 0000000000000..3ce2225420e60 --- /dev/null +++ b/paddle/fluid/operators/math/lapack_function.cc @@ -0,0 +1,130 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/math/lapack_function.h" +#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/platform/dynload/lapack.h" + +namespace paddle { +namespace operators { +namespace math { + +// LU (for example) +template <> +void lapackLu(int m, int n, double *a, int lda, int *ipiv, int *info) { + platform::dynload::dgetrf_(&m, &n, a, &lda, ipiv, info); +} + +template <> +void lapackLu(int m, int n, float *a, int lda, int *ipiv, int *info) { + platform::dynload::sgetrf_(&m, &n, a, &lda, ipiv, info); +} + +// eigh +template <> +void lapackEigh(char jobz, char uplo, int n, float *a, int lda, float *w, + float *work, int lwork, float *rwork, int lrwork, + int *iwork, int liwork, int *info) { + (void)rwork; // unused + (void)lrwork; // unused + platform::dynload::ssyevd_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, iwork, + &liwork, info); +} + +template <> +void lapackEigh(char jobz, char uplo, int n, double *a, int lda, + double *w, double *work, int lwork, double *rwork, + int lrwork, int *iwork, int liwork, int *info) { + (void)rwork; // unused + (void)lrwork; // unused + platform::dynload::dsyevd_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, iwork, + &liwork, info); +} + +template <> +void lapackEigh, float>( + char jobz, char uplo, int n, platform::complex *a, int lda, float *w, + platform::complex *work, int lwork, float *rwork, int lrwork, + int *iwork, int liwork, int *info) { + platform::dynload::cheevd_(&jobz, &uplo, &n, + reinterpret_cast *>(a), &lda, + w, reinterpret_cast *>(work), + &lwork, rwork, &lrwork, iwork, &liwork, info); +} + +template <> +void lapackEigh, double>( + char jobz, char uplo, int n, platform::complex *a, int lda, + double *w, platform::complex *work, int lwork, double *rwork, + int lrwork, int *iwork, int liwork, int *info) { + platform::dynload::zheevd_(&jobz, &uplo, &n, + reinterpret_cast *>(a), &lda, + w, reinterpret_cast *>(work), + &lwork, rwork, &lrwork, iwork, &liwork, info); +} + +// Eig +template <> +void lapackEig(char jobvl, char jobvr, int n, double *a, int lda, + double *w, double *vl, int ldvl, double *vr, int ldvr, + double *work, int lwork, double *rwork, int *info) { + double *wr = w; + double *wi = w + n; + 
(void)rwork; // unused + platform::dynload::dgeev_(&jobvl, &jobvr, &n, a, &lda, wr, wi, vl, &ldvl, vr, + &ldvr, work, &lwork, info); +} + +template <> +void lapackEig(char jobvl, char jobvr, int n, float *a, int lda, + float *w, float *vl, int ldvl, float *vr, int ldvr, + float *work, int lwork, float *rwork, int *info) { + float *wr = w; + float *wi = w + n; + (void)rwork; // unused + platform::dynload::sgeev_(&jobvl, &jobvr, &n, a, &lda, wr, wi, vl, &ldvl, vr, + &ldvr, work, &lwork, info); +} + +template <> +void lapackEig, double>( + char jobvl, char jobvr, int n, platform::complex *a, int lda, + platform::complex *w, platform::complex *vl, int ldvl, + platform::complex *vr, int ldvr, platform::complex *work, + int lwork, double *rwork, int *info) { + platform::dynload::zgeev_( + &jobvl, &jobvr, &n, reinterpret_cast *>(a), &lda, + reinterpret_cast *>(w), + reinterpret_cast *>(vl), &ldvl, + reinterpret_cast *>(vr), &ldvr, + reinterpret_cast *>(work), &lwork, rwork, info); +} + +template <> +void lapackEig, float>( + char jobvl, char jobvr, int n, platform::complex *a, int lda, + platform::complex *w, platform::complex *vl, int ldvl, + platform::complex *vr, int ldvr, platform::complex *work, + int lwork, float *rwork, int *info) { + platform::dynload::cgeev_( + &jobvl, &jobvr, &n, reinterpret_cast *>(a), &lda, + reinterpret_cast *>(w), + reinterpret_cast *>(vl), &ldvl, + reinterpret_cast *>(vr), &ldvr, + reinterpret_cast *>(work), &lwork, rwork, info); +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/lapack_function.h b/paddle/fluid/operators/math/lapack_function.h new file mode 100644 index 0000000000000..a4c2c865c859a --- /dev/null +++ b/paddle/fluid/operators/math/lapack_function.h @@ -0,0 +1,37 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
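For illustration only (not part of the patch): the real-typed lapackEig specializations above split the single output buffer as wr = w and wi = w + n, which implies the caller supplies 2 * n elements and recombines the halves afterwards, much as the RealImagToComplexFunctor added earlier in the patch does element-wise. A hedged sketch of that recombination (the helper name is hypothetical):

// Illustration only: recombining the packed (wr | wi) eigenvalue buffer used
// by the real-typed lapackEig specializations into complex values.
#include <cassert>
#include <complex>
#include <vector>

std::vector<std::complex<double>> UnpackEigenvalues(
    const std::vector<double>& w, int n) {
  assert(static_cast<int>(w.size()) == 2 * n);  // caller allocates 2 * n
  std::vector<std::complex<double>> out(n);
  for (int i = 0; i < n; ++i) {
    out[i] = {w[i], w[n + i]};  // real part from wr, imaginary part from wi
  }
  return out;
}

int main() {
  // Pretend dgeev_ filled w with wr = {1, 2, 3} and wi = {0, 0.5, -0.5}.
  std::vector<double> w = {1.0, 2.0, 3.0, 0.0, 0.5, -0.5};
  auto vals = UnpackEigenvalues(w, 3);
  assert(vals[1] == std::complex<double>(2.0, 0.5));
  return 0;
}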
+ +#pragma once + +namespace paddle { +namespace operators { +namespace math { + +// LU (for example) +template +void lapackLu(int m, int n, T* a, int lda, int* ipiv, int* info); + +template +void lapackEigh(char jobz, char uplo, int n, T* a, int lda, ValueType* w, + T* work, int lwork, ValueType* rwork, int lrwork, int* iwork, + int liwork, int* info); + +template +void lapackEig(char jobvl, char jobvr, int n, T1* a, int lda, T1* w, T1* vl, + int ldvl, T1* vr, int ldvr, T1* work, int lwork, T2* rwork, + int* info); + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 46dd58562aebd..cfdfa456e39ea 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -44,6 +44,7 @@ template struct SetConstant>; #define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ template struct Transpose; \ template struct Transpose; \ template struct Transpose; \ diff --git a/paddle/fluid/operators/math/matrix_solve.cc b/paddle/fluid/operators/math/matrix_solve.cc new file mode 100644 index 0000000000000..7f13b5c8a70ee --- /dev/null +++ b/paddle/fluid/operators/math/matrix_solve.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/matrix_solve.h" +#include "Eigen/Core" +#include "Eigen/LU" +#include "paddle/fluid/operators/math/blas.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class MatrixSolveFunctor { + public: + void operator()(const platform::CPUDeviceContext& dev_ctx, + const framework::Tensor& a, const framework::Tensor& b, + framework::Tensor* out) { + compute_solve_eigen(dev_ctx, a, b, out); + } +}; + +template class MatrixSolveFunctor; +template class MatrixSolveFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_solve.cu.cc b/paddle/fluid/operators/math/matrix_solve.cu.cc new file mode 100644 index 0000000000000..efb3a07e4c1b4 --- /dev/null +++ b/paddle/fluid/operators/math/matrix_solve.cu.cc @@ -0,0 +1,168 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/matrix_solve.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/solve_op.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace platform { +class CUDADeviceContext; +} // namespace platform +} // namespace paddle + +namespace paddle { +namespace operators { +namespace math { + +template +class MatrixSolveFunctor; + +template +class MatrixSolveFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& a, const framework::Tensor& b, + framework::Tensor* out) { +#ifndef PADDLE_WITH_HIP + + // solve the equation: Ax = B, + // use cuBlas cublasgetrfBatched funcion to performs the LU + // factorization of each matrix A, + // and then use cuBlas cublasgetriBatched function to solve the + // equation after LU factorization. + // ref: + // https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getrfbatched + const auto& a_dims = a.dims(); + const int a_rank = a_dims.size(); + int n = a_dims[a_rank - 1]; + int lda = n; + int batch_size = a_rank > 2 ? a.numel() / (n * n) : 1; + + const auto& b_dims = b.dims(); + const int b_rank = b_dims.size(); + int nrhs = b_dims[b_rank - 1]; + int ldb = b_dims[b_rank - 2]; + + // make sure the out dims is right + out->Resize(b_dims); + out->mutable_data(context.GetPlace()); + + // copy input A to a temporary tensor tmp_a, + // LU factorization, written back to original matrix A, so in the beginning, + // it's necessary to create a temporary tensor tmp_a. + Tensor tmp_a(a.type()); + tmp_a.Resize(a.dims()); + tmp_a.mutable_data(context.GetPlace()); + TensorCopy(a, context.GetPlace(), &tmp_a); + + // copy input B to a temporary tensor tmp_b, and transpose tmp_b, + // because cuBlas assumes column-major while Paddle uses row-majar. + Tensor tmp_b(b.type()); + const auto& new_dims_vec = getNewDimsVec(b_dims); + tmp_b.Resize(framework::make_ddim(new_dims_vec)); + tmp_b.mutable_data(context.GetPlace()); + math::TransposeNormal trans; + std::vector new_axis = getNewAxis(b_rank); + trans(context, b, &tmp_b, new_axis); + + const T* a_data_in_gpu = tmp_a.data(); + const T* b_data_in_gpu = tmp_b.data(); + + std::vector cpu_ptrs(batch_size * 2); + for (int i = 0; i < batch_size; ++i) { + cpu_ptrs[i] = a_data_in_gpu + i * n * n; + cpu_ptrs[i + batch_size] = b_data_in_gpu + i * n * nrhs; + } + + // Copy the addresses of A and tmp_b from host to device. + memory::allocation::AllocationPtr tmp_gpu_ptrs_data = + memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), + tmp_gpu_ptrs_data->ptr(), platform::CPUPlace(), + static_cast(cpu_ptrs.data()), + cpu_ptrs.size() * sizeof(T*), context.stream()); + + T** gpu_tmp_b_ptrs = + reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; + + // Allocate device memory for BatchedGETRF's info and pivots. + int num_ints = n < 32 ? 
batch_size : batch_size * (n + 1); + memory::allocation::AllocationPtr tmp_gpu_info_data = + memory::Alloc(context, num_ints * sizeof(int)); + int* gpu_info_ptr = reinterpret_cast(tmp_gpu_info_data->ptr()); + + auto blas = math::GetBlas(context); + + // only for singular checking + std::vector info; + info.resize(batch_size); + + int* gpu_pivot_ptr = + reinterpret_cast(tmp_gpu_info_data->ptr()) + batch_size; + + // This function performs the LU factorization of each matrix A by the + // equation A = L * U. L and U are written back to original matrix A, + // and diagonal elements of L are discarded. + blas.BatchedGETRF(n, reinterpret_cast(tmp_gpu_ptrs_data->ptr()), + gpu_pivot_ptr, gpu_info_ptr, batch_size); + + // check whether BatchedGETRF is executed successfully or not + memory::Copy(platform::CPUPlace(), info.data(), + BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), + gpu_info_ptr, sizeof(int) * batch_size, context.stream()); + for (int i = 0; i < batch_size; ++i) { + PADDLE_ENFORCE_EQ(info[i], 0, + platform::errors::PreconditionNotMet( + "For batch [%d]: U(%d, %d) is zero, singular U. " + "Please check the matrix value and change it to a " + "non-singular matrix", + i, info[i], info[i])); + } + + // hold the result code from BatchedGETRS + int host_info = 0; + + // to solve the equation after LU factorization + CBLAS_TRANSPOSE transA = CblasTrans; + blas.BatchedGETRS( + transA, n, nrhs, reinterpret_cast(tmp_gpu_ptrs_data->ptr()), + lda, gpu_pivot_ptr, gpu_tmp_b_ptrs, ldb, &host_info, batch_size); + + // check whether BatchedGETRS is executed successfully or not + PADDLE_ENFORCE_EQ(host_info, 0, + platform::errors::InvalidArgument( + "The [%d]'th argument to cublas*getrsBatched had " + "an illegal value.", + -host_info)); + + // transpose tmp_b to get the final result in row-major form. + math::TransposeNormal trans2; + trans2(context, tmp_b, out, new_axis); + +#else + compute_solve_eigen(context, a, b, out); +#endif + } +}; + +template class MatrixSolveFunctor; +template class MatrixSolveFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_solve.h b/paddle/fluid/operators/math/matrix_solve.h new file mode 100644 index 0000000000000..93c37ae425640 --- /dev/null +++ b/paddle/fluid/operators/math/matrix_solve.h @@ -0,0 +1,82 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "Eigen/Core" +#include "Eigen/LU" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +template +void compute_solve_eigen(const DeviceContext& context, + const framework::Tensor& a, const framework::Tensor& b, + framework::Tensor* out) { + using Matrix = + Eigen::Matrix; + using EigenMatrixMap = Eigen::Map; + using ConstEigenMatrixMap = Eigen::Map; + // prepare for a + const auto& a_mat_dims = a.dims(); + const int a_rank = a_mat_dims.size(); + int n = a_mat_dims[a_rank - 1]; + int a_batch_size = a_rank > 2 ? a.numel() / (n * n) : 1; + + // prepare for b + const auto& b_mat_dims = b.dims(); + const int b_rank = b_mat_dims.size(); + int b_h = n; + int b_w = b_mat_dims[b_rank - 1]; + int b_batch_size = b_rank > 2 ? b.numel() / (b_h * b_w) : 1; + + const T* a_ptr = a.data(); + const T* b_ptr = b.data(); + out->Resize(b_mat_dims); // make sure the out dims is right + + T* out_ptr = out->mutable_data(context.GetPlace()); + if (a_batch_size == b_batch_size) { + for (int i = 0; i < a_batch_size; ++i) { + ConstEigenMatrixMap a_mat(a_ptr + i * n * n, n, n); + ConstEigenMatrixMap b_mat(b_ptr + i * b_h * b_w, b_h, b_w); + EigenMatrixMap out_mat(out_ptr + i * b_h * b_w, b_h, b_w); + Eigen::PartialPivLU lu; + lu.compute(a_mat); + const T min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff(); + PADDLE_ENFORCE_GT( + min_abs_pivot, static_cast(0), + platform::errors::InvalidArgument("Input is not invertible.")); + out_mat.noalias() = lu.solve(b_mat); + } + } else { + PADDLE_ENFORCE_EQ(a_batch_size, b_batch_size, + platform::errors::InvalidArgument( + "All input tensors must have the same rank.")); + } +} + +template +class MatrixSolveFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& a, + const framework::Tensor& b, framework::Tensor* out); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index dbb3f46f38b88..48b0d2ab46057 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -16,66 +16,141 @@ limitations under the License. 
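For illustration only (not part of the patch): the CPU fallback above, compute_solve_eigen, solves each A x = b with Eigen's PartialPivLU and rejects singular input by checking the smallest absolute pivot of the LU factorization. A minimal standalone usage sketch of the same pattern:

// Illustration only: the per-matrix solve pattern used by compute_solve_eigen
// -- LU-factorize A with partial pivoting, reject singular input by
// inspecting the smallest |pivot|, then solve for b.
#include <cassert>
#include <cmath>
#include "Eigen/Core"
#include "Eigen/LU"

int main() {
  Eigen::Matrix2d a;
  a << 3.0, 1.0,
       1.0, 2.0;
  Eigen::Vector2d b(9.0, 8.0);  // solution is x = (2, 3)

  Eigen::PartialPivLU<Eigen::Matrix2d> lu(a);
  double min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff();
  assert(min_abs_pivot > 0.0);  // the patch raises InvalidArgument here

  Eigen::Vector2d x = lu.solve(b);
  assert(std::abs(x(0) - 2.0) < 1e-12 && std::abs(x(1) - 3.0) < 1e-12);
  return 0;
}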
*/ #include #include "paddle/fluid/operators/math/pooling.h" +#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/fast_divmod.h" #include "paddle/fluid/platform/gpu_launch_config.h" +#ifdef __HIPCC__ +#define POOLING_BLOCK_SIZE 256 +#else +#define POOLING_BLOCK_SIZE 512 +#endif + namespace paddle { namespace operators { namespace math { +struct FastDivModForPooling { + public: + platform::FastDivMod channel; + platform::FastDivMod width; + platform::FastDivMod height; + + explicit HOSTDEVICE FastDivModForPooling(const int channels, + const int output_width, + const int output_height) { + channel = platform::FastDivMod(channels); + width = platform::FastDivMod(output_width); + height = platform::FastDivMod(output_height); + } +}; + +struct FastDivModForPoolingWithMoreStaff { + public: + platform::FastDivMod channel; + platform::FastDivMod width; + platform::FastDivMod height; + platform::FastDivMod ksize_w; + platform::FastDivMod ksize_h; + platform::FastDivMod stride_w; + platform::FastDivMod stride_h; + + explicit HOSTDEVICE FastDivModForPoolingWithMoreStaff( + const int channels, const int input_width, const int input_height, + const int ksize_width, const int ksize_height, const int stride_width, + const int stride_height) { + channel = platform::FastDivMod(channels); + width = platform::FastDivMod(input_width); + height = platform::FastDivMod(input_height); + ksize_w = platform::FastDivMod(ksize_width); + ksize_h = platform::FastDivMod(ksize_height); + stride_w = platform::FastDivMod(stride_width); + stride_h = platform::FastDivMod(stride_height); + } +}; + +template +__device__ void OffsetPreparationFor4Dimension( + int index, bool channel_last, FastDivModForPooling divmods, + const int pad_width, const int pad_height, const int aux_width, + const int aux_height, int* w_offset, int* h_offset, int* c_offset, + int* stride) { + if (!channel_last) { /* NCHW */ + auto input_width_divmod = divmods.width.Divmod(index); + auto input_height_divmod = divmods.height.Divmod(input_width_divmod.val[0]); + auto channel_divmod = divmods.channel.Divmod(input_height_divmod.val[0]); + *w_offset = input_width_divmod.val[1] + pad_width; + *h_offset = input_height_divmod.val[1] + pad_height; + *c_offset = channel_divmod.val[1]; + *stride = (channel_divmod.val[0] * divmods.channel.divisor + *c_offset) * + aux_height * aux_width; + } else { /* NHWC */ + auto c_divmod = divmods.channel.Divmod(index); + auto input_width_divmod = divmods.width.Divmod(c_divmod.val[0]); + auto input_height_divmod = divmods.height.Divmod(input_width_divmod.val[0]); + *c_offset = c_divmod.val[1]; + *w_offset = input_width_divmod.val[1] + pad_width; + *h_offset = input_height_divmod.val[1] + pad_height; + *stride = input_height_divmod.val[0] * aux_height * aux_width * + divmods.channel.divisor; + } +} + +int GetThreadsPerBlock(const platform::CUDADeviceContext& ctx, + int threads_per_block, int64_t numel) { + int sm_count = ctx.GetSMCount(); + if (numel / (sm_count << 1) < threads_per_block) { + // Round up threads number into an exponential multiple of 2, while number + // of acitve blocks is about twice of SM, to acquire better performance. + threads_per_block = platform::RoundToPowerOfTwo(numel / (sm_count << 1)); + } else if (numel / (sm_count << 2) < threads_per_block) { + // Round up threads number into an exponential multiple of 2, while number + // of acitve blocks is about 4 times of SM, to acquire better performance. 
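+    // Either branch trades block size for occupancy: for small workloads a
+    // full-size block would leave most SMs idle, so the block shrinks until
+    // the grid supplies a few resident blocks per SM. Worked example
+    // (editorial): numel = 40000 on an 80-SM device takes the first branch,
+    // 40000 / (80 * 2) = 250, rounded up to 256 threads per block, i.e.
+    // roughly two active blocks per SM. The result is clamped to at least
+    // 64 threads below.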
+ threads_per_block = platform::RoundToPowerOfTwo(numel / (sm_count << 2)); + } + // Number of threads per block shall be larger than 64. + return std::max(64, threads_per_block); +} + template -__global__ void KernelPool2D(const int nthreads, const T* input_data, - const int channels, const int input_height, - const int input_width, const int output_height, - const int output_width, const int ksize_height, - const int ksize_width, const int stride_height, - const int stride_width, const int padding_height, - const int padding_width, PoolProcess pool_process, - bool exclusive, bool adaptive, T* output_data, - bool channel_last = false) { +__global__ void KernelPool2D( + const int nthreads, const T* input_data, const int channels, + const int input_height, const int input_width, const int output_height, + const int output_width, const int ksize_height, const int ksize_width, + const int stride_height, const int stride_width, const int padding_height, + const int padding_width, FastDivModForPooling divmods, + PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int pw, ph, c, batch_idx; - if (!channel_last) { /*NCHW*/ - pw = index % output_width; - ph = (index / output_width) % output_height; - c = (index / output_width / output_height) % channels; - batch_idx = index / output_width / output_height / channels; - } else { /*NHWC*/ - c = index % channels; - pw = (index / channels) % output_width; - ph = (index / channels / output_width) % output_height; - batch_idx = index / channels / output_width / output_height; - } + int hstart, hend, wstart, wend; + int w_offset, h_offset, c_offset, input_offset; + OffsetPreparationFor4Dimension( + index, channel_last, divmods, 0, 0, input_width, input_height, + &w_offset, &h_offset, &c_offset, &input_offset); + input_data += input_offset; - int hstart, hend; - int wstart, wend; if (adaptive) { - hstart = AdaptStartIndex(ph, input_height, output_height); - hend = AdaptEndIndex(ph, input_height, output_height); - - wstart = AdaptStartIndex(pw, input_width, output_width); - wend = AdaptEndIndex(pw, input_width, output_width); + hstart = AdaptStartIndex(h_offset, input_height, output_height); + hend = AdaptEndIndex(h_offset, input_height, output_height); + wstart = AdaptStartIndex(w_offset, input_width, output_width); + wend = AdaptEndIndex(w_offset, input_width, output_width); } else { - hstart = ph * stride_height - padding_height; + hstart = h_offset * stride_height - padding_height; hend = min(hstart + ksize_height, input_height); hstart = max(hstart, 0); - - wstart = pw * stride_width - padding_width; + wstart = w_offset * stride_width - padding_width; wend = min(wstart + ksize_width, input_width); wstart = max(wstart, 0); } - if (!channel_last) { - input_data += (batch_idx * channels + c) * input_height * input_width; - } else { - input_data += batch_idx * input_height * input_width * channels; - } T ele = pool_process.initial(); for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - auto input_idx = channel_last ? (h * input_width + w) * channels + c - : h * input_width + w; + auto input_idx = channel_last + ? 
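+        // In NHWC the offset folded into input_data above is
+        // batch_idx * H * W * C, so each (h, w) element is addressed as
+        // (h * input_width + w) * channels + c_offset; in NCHW the offset
+        // already selects the (batch, channel) slice, so h * input_width + w
+        // is enough.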
(h * input_width + w) * channels + c_offset + : h * input_width + w; pool_process.compute(input_data[input_idx], &ele); } } @@ -85,91 +160,109 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, output_data[index] = ele; } } -template + +template __global__ void KernelPool2DGrad( - const int nthreads, const T* input_data, const T* output_data, - const T* output_grad, const int channels, const int input_height, - const int input_width, const int output_height, const int output_width, - const int ksize_height, const int ksize_width, const int stride_height, - const int stride_width, const int padding_height, const int padding_width, - PoolProcess pool_process, bool exclusive, bool adaptive, T* input_grad, - bool channel_last = false) { + const int nthreads, const T* __restrict__ input_data, + const T* __restrict__ output_data, const const T* __restrict__ output_grad, + const int output_width, const int output_height, const int input_width, + const int input_height, const int ksize_width, const int ksize_height, + const int stride_width, const int stride_height, const int padding_width, + const int padding_height, FastDivModForPoolingWithMoreStaff divmods, + PoolProcess pool_process, bool exclusive, bool adaptive, + T* __restrict__ input_grad, bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int w_offset, h_offset, offsetC, batch_idx; - if (!channel_last) { /* NCHW */ - w_offset = index % input_width + padding_width; - h_offset = (index / input_width) % input_height + padding_height; - offsetC = (index / input_width / input_height) % channels; - batch_idx = index / input_width / input_height / channels; - } else { /* NHWC */ - offsetC = index % channels; - w_offset = (index / channels) % input_width + padding_width; - h_offset = - (index / channels / input_width) % input_height + padding_height; - batch_idx = index / channels / input_width / input_height; + T input = static_cast(0); + T input_grad_data = static_cast(0); + int phstart, phend, pwstart, pwend; + int w_offset, h_offset, c_offset, output_offset; + OffsetPreparationFor4Dimension<>(index, channel_last, divmods, + padding_width, padding_height, + output_width, output_height, &w_offset, + &h_offset, &c_offset, &output_offset); + if (pool_process.use_x) { + input = input_data[index]; + output_data += output_offset; } + output_grad += output_offset; - int phstart, phend; - int pwstart, pwend; if (adaptive) { - phstart = AdaptStartIndex(h_offset, output_height, input_height); - phend = AdaptEndIndex(h_offset, output_height, input_height); + auto tmp_phend = divmods.height.Divmod((h_offset + 1) * output_height); + auto tmp_pwend = divmods.width.Divmod((w_offset + 1) * output_width); + phstart = divmods.height.Div(h_offset * output_height); + pwstart = divmods.width.Div(w_offset * output_width); + phend = tmp_phend.val[1] > 0 ? tmp_phend.val[0] + 1 : tmp_phend.val[0]; + pwend = tmp_pwend.val[1] > 0 ? tmp_pwend.val[0] + 1 : tmp_pwend.val[0]; - pwstart = AdaptStartIndex(w_offset, output_width, input_width); - pwend = AdaptEndIndex(w_offset, output_width, input_width); - } else { - phstart = (h_offset < ksize_height) - ? 0 - : (h_offset - ksize_height) / stride_height + 1; - pwstart = (w_offset < ksize_width) - ? 
0 - : (w_offset - ksize_width) / stride_width + 1; - phend = min(h_offset / stride_height + 1, output_height); - pwend = min(w_offset / stride_width + 1, output_width); - } - T gradient = static_cast(0.0); - T input = input_data[index]; - - int output_stride; - if (!channel_last) { - output_stride = - (batch_idx * channels + offsetC) * output_height * output_width; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + auto ksize_w_divmod = divmods.ksize_w.Divmod(input_width); + auto ksize_h_divmod = divmods.ksize_h.Divmod(input_height); + auto tmp_width = ksize_w_divmod.val[1] > 0 ? ksize_w_divmod.val[0] + 1 + : ksize_w_divmod.val[0]; + auto tmp_height = ksize_h_divmod.val[1] > 0 + ? ksize_h_divmod.val[0] + 1 + : ksize_h_divmod.val[0]; + int pool_size = tmp_height * tmp_width; + int tmp_idx = ph * output_width + pw; + int output_sub_idx = + channel_last ? tmp_idx * divmods.channel.divisor + c_offset + : tmp_idx; + T ouput_value = pool_process.use_x ? output_data[output_sub_idx] + : static_cast(0); + pool_process.compute(input, ouput_value, output_grad[output_sub_idx], + static_cast(1.0 / pool_size), + &input_grad_data); + } + } } else { - output_stride = batch_idx * output_height * output_width * channels; - } - - output_data += output_stride; - output_grad += output_stride; - - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - int pool_size; - if (adaptive) { - pool_size = static_cast(ceil(static_cast(input_height) / - ksize_height)) * - static_cast( - ceil(static_cast(input_width) / ksize_width)); - } else { - int hstart = ph * stride_height - padding_height; - int wstart = pw * stride_width - padding_width; - int hend = min(hstart + ksize_height, input_height); - int wend = min(wstart + ksize_width, input_width); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - pool_size = exclusive ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + auto stride_height_div = divmods.stride_h.Div(h_offset - ksize_height); + auto stride_width_div = divmods.stride_w.Div(w_offset - ksize_width); + phstart = (h_offset < ksize_height) ? 0 : stride_height_div + 1; + pwstart = (w_offset < ksize_width) ? 0 : stride_width_div + 1; + phend = min(divmods.stride_h.Div(h_offset) + 1, output_height); + pwend = min(divmods.stride_w.Div(w_offset) + 1, output_width); + + if (exclusive) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + int hstart = ph * stride_height - padding_height; + int wstart = pw * stride_width - padding_width; + int hend = min(hstart + ksize_height, input_height); + int wend = min(wstart + ksize_width, input_width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + int pool_size = (hend - hstart) * (wend - wstart); + int tmp_idx = ph * output_width + pw; + int output_sub_idx = + channel_last ? tmp_idx * divmods.channel.divisor + c_offset + : tmp_idx; + T ouput_value = pool_process.use_x ? output_data[output_sub_idx] + : static_cast(0); + pool_process.compute( + input, ouput_value, output_grad[output_sub_idx], + static_cast(1.0 / pool_size), &input_grad_data); + } + } + } else { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + int pool_size = ksize_height * ksize_width; + int tmp_idx = ph * output_width + pw; + int output_sub_idx = + channel_last ? tmp_idx * divmods.channel.divisor + c_offset + : tmp_idx; + T ouput_value = pool_process.use_x ? 
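+          // use_x is a compile-time trait of the grad functor: MaxPoolGrad
+          // needs the forward input/output to locate the max (use_x == true),
+          // while AvgPoolGrad ignores them, so the output_data read (and the
+          // earlier input_data read) is skipped and a dummy zero is passed.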
output_data[output_sub_idx] + : static_cast(0); + pool_process.compute( + input, ouput_value, output_grad[output_sub_idx], + static_cast(1.0 / pool_size), &input_grad_data); + } } - - int output_sub_idx = channel_last - ? (ph * output_width + pw) * channels + offsetC - : ph * output_width + pw; - pool_process.compute(input, output_data[output_sub_idx], - output_grad[output_sub_idx], - static_cast(1.0 / pool_size), &gradient); } } - input_grad[index] = gradient; + input_grad[index] = input_grad_data; } } @@ -180,45 +273,32 @@ __global__ void KernelMaxPool2DGrad( const int input_width, const int output_height, const int output_width, const int ksize_height, const int ksize_width, const int stride_height, const int stride_width, const int padding_height, const int padding_width, - T* input_grad, bool channel_last = false) { + T* input_grad, FastDivModForPooling divmods, bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int pw, ph, c, batch_idx; - if (!channel_last) { /* NCHW */ - pw = index % output_width; - ph = (index / output_width) % output_height; - c = (index / output_width / output_height) % channels; - batch_idx = index / output_width / output_height / channels; - } else { /* NHWC */ - c = index % channels; - pw = (index / channels) % output_width; - ph = (index / channels / output_width) % output_height; - batch_idx = index / channels / output_width / output_height; - } - int hstart = ph * stride_height - padding_height; + int w_offset, h_offset, c_offset, input_offset; + OffsetPreparationFor4Dimension( + index, channel_last, divmods, 0, 0, input_width, input_height, + &w_offset, &h_offset, &c_offset, &input_offset); + input_data += input_offset; + input_grad += input_offset; + + int hstart = h_offset * stride_height - padding_height; int hend = min(hstart + ksize_height, input_height); hstart = max(hstart, 0); - int wstart = pw * stride_width - padding_width; + int wstart = w_offset * stride_width - padding_width; int wend = min(wstart + ksize_width, input_width); wstart = max(wstart, 0); - int input_stride; - if (!channel_last) { - input_stride = (batch_idx * channels + c) * input_height * input_width; - } else { - input_stride = batch_idx * input_height * input_width * channels; - } - input_data += input_stride; - input_grad += input_stride; - T ele = output_data[index]; int maxIndex = -1; bool stop = false; for (int h = hstart; h < hend && !stop; ++h) { for (int w = wstart; w < wend && !stop; ++w) { - int input_data_idx = channel_last ? (h * input_width + w) * channels + c - : h * input_width + w; + int input_data_idx = channel_last + ? 
(h * input_width + w) * channels + c_offset + : h * input_width + w; if (ele == input_data[input_data_idx]) { maxIndex = input_data_idx; stop = true; @@ -264,10 +344,13 @@ void Pool2dDirectCUDAFunctor::operator()( dim3 threads(thread_num, 1); dim3 grid(blocks, 1); + auto pool_divmods = + FastDivModForPooling(input_channels, output_width, output_height); KernelPool2D<<>>( nthreads, input, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, - padding_height, padding_width, pool_compute, exclusive, adaptive, output); + padding_height, padding_width, pool_divmods, pool_compute, exclusive, + adaptive, output); } /* @@ -311,11 +394,14 @@ class Pool2dFunctor { int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); dim3 grid(blocks, 1); + + auto pool_divmods = + FastDivModForPooling(input_channels, output_width, output_height); KernelPool2D<<>>( nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, - stride_width, padding_height, padding_width, pool_process, exclusive, - adaptive, output_data); + stride_width, padding_height, padding_width, pool_divmods, pool_process, + exclusive, adaptive, output_data); } void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const std::vector& ksize, @@ -357,11 +443,14 @@ class Pool2dFunctor { int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); dim3 grid(blocks, 1); + + auto pool_divmods = + FastDivModForPooling(input_channels, output_width, output_height); KernelPool2D<<>>( nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, - stride_width, padding_height, padding_width, pool_process, exclusive, - adaptive, output_data, channel_last); + stride_width, padding_height, padding_width, pool_divmods, pool_process, + exclusive, adaptive, output_data, channel_last); } }; /* @@ -402,15 +491,18 @@ class Pool2dGradFunctor { T* input_grad_data = input_grad->mutable_data(context.GetPlace()); int nthreads = batch_size * input_channels * input_height * input_width; - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelPool2DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_height, input_width, output_height, output_width, ksize_height, - ksize_width, stride_height, stride_width, padding_height, padding_width, - pool_process, exclusive, adaptive, input_grad_data); + int blocks = GetThreadsPerBlock(context, POOLING_BLOCK_SIZE, nthreads); + int grids = (nthreads + blocks - 1) / blocks; + + auto pool_divmods = FastDivModForPoolingWithMoreStaff( + input_channels, input_width, input_height, ksize_width, ksize_height, + stride_width, stride_height); + + KernelPool2DGrad<<>>( + nthreads, input_data, output_data, output_grad_data, output_width, + output_height, input_width, input_height, ksize_width, ksize_height, + stride_width, stride_height, padding_width, padding_height, + pool_divmods, pool_process, exclusive, adaptive, input_grad_data); } void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, @@ -424,7 +516,6 @@ class Pool2dGradFunctor { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; - const int input_channels = channel_last ? 
input.dims()[3] : input.dims()[1]; const int input_height = channel_last ? input.dims()[1] : input.dims()[2]; const int input_width = channel_last ? input.dims()[2] : input.dims()[3]; @@ -447,19 +538,22 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); int nthreads = batch_size * input_channels * input_height * input_width; - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelPool2DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_height, input_width, output_height, output_width, ksize_height, - ksize_width, stride_height, stride_width, padding_height, padding_width, - pool_process, exclusive, adaptive, input_grad_data, channel_last); + int blocks = GetThreadsPerBlock(context, POOLING_BLOCK_SIZE, nthreads); + int grids = (nthreads + blocks - 1) / blocks; + + auto pool_divmods = FastDivModForPoolingWithMoreStaff( + input_channels, input_width, input_height, ksize_width, ksize_height, + stride_width, stride_height); + + KernelPool2DGrad<<>>( + nthreads, input_data, output_data, output_grad_data, output_width, + output_height, input_width, input_height, ksize_width, ksize_height, + stride_width, stride_height, padding_width, padding_height, + pool_divmods, pool_process, exclusive, adaptive, input_grad_data, + channel_last); } }; @@ -505,11 +599,13 @@ class MaxPool2dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); + auto pool_divmods = + FastDivModForPooling(input_channels, output_width, output_height); KernelMaxPool2DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, - input_grad_data); + input_grad_data, pool_divmods); } void operator()( const platform::CUDADeviceContext& context, @@ -550,11 +646,14 @@ class MaxPool2dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); + auto pool_divmods = + FastDivModForPooling(input_channels, output_width, output_height); + KernelMaxPool2DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, - input_grad_data, channel_last); + input_grad_data, pool_divmods, channel_last); } }; @@ -689,35 +788,40 @@ __global__ void KernelPool3D( } } -template +template __global__ void KernelPool3DGrad( - const int nthreads, const T* input_data, const T* output_data, - const T* output_grad, const int channels, const int input_depth, - const int input_height, const int input_width, const int output_depth, - const int output_height, const int output_width, const int ksize_depth, - const int ksize_height, const int ksize_width, const int stride_depth, - const int stride_height, const int stride_width, const int padding_depth, - const int padding_height, const int padding_width, PoolProcess pool_process, - bool exclusive, bool adaptive, T* input_grad, bool channel_last = false) { + const int nthreads, const T* __restrict__ input_data, + const T* __restrict__ output_data, const T* __restrict__ output_grad, + const int channels, const int input_depth, const int input_height, + const int input_width, const int output_depth, const int output_height, + const int output_width, 
const int ksize_depth, const int ksize_height, + const int ksize_width, const int stride_depth, const int stride_height, + const int stride_width, const int padding_depth, const int padding_height, + const int padding_width, PoolProcess pool_process, bool exclusive, + bool adaptive, T* input_grad, bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int w_offset, h_offset, d_offset, offsetC, batch_idx; + int w_offset, h_offset, d_offset, c_offset, batch_idx, output_stride; + T input = static_cast(0); if (!channel_last) { /* "NCDHW" */ w_offset = index % input_width + padding_width; h_offset = (index / input_width) % input_height + padding_height; d_offset = (index / input_width / input_height) % input_depth + padding_depth; - offsetC = (index / input_width / input_height / input_depth) % channels; + c_offset = (index / input_width / input_height / input_depth) % channels; batch_idx = index / input_width / input_height / input_depth / channels; - + output_stride = (batch_idx * channels + c_offset) * output_depth * + output_height * output_width; } else { /* "NDHWC" */ - offsetC = index % channels; + c_offset = index % channels; w_offset = (index / channels) % input_width + padding_width; h_offset = (index / channels / input_width) % input_height + padding_height; d_offset = (index / channels / input_width / input_height) % input_depth + padding_depth; batch_idx = index / channels / input_width / input_height / input_depth; + output_stride = + batch_idx * output_depth * output_height * output_width * channels; } int pdstart, pdend; @@ -746,20 +850,12 @@ __global__ void KernelPool3DGrad( phend = min((h_offset) / stride_height + 1, output_height); pwend = min((w_offset) / stride_width + 1, output_width); } - - T gradient = static_cast(0.0); - T input = input_data[index]; - - int output_stride; - if (!channel_last) { - output_stride = (batch_idx * channels + offsetC) * output_depth * - output_height * output_width; - } else { - output_stride = - batch_idx * output_depth * output_height * output_width * channels; + if (pool_process.use_x) { + input = input_data[index]; + output_data += output_stride; } - output_data += output_stride; output_grad += output_stride; + T input_grad_data = static_cast(0.0); for (int pd = pdstart; pd < pdend; ++pd) { for (int ph = phstart; ph < phend; ++ph) { @@ -792,16 +888,17 @@ __global__ void KernelPool3DGrad( int output_sub_idx = channel_last ? ((pd * output_height + ph) * output_width + pw) * channels + - offsetC + c_offset : (pd * output_height + ph) * output_width + pw; - - pool_process.compute(input, output_data[output_sub_idx], - output_grad[output_sub_idx], - static_cast(1.0 / pool_size), &gradient); + T ouput_value = pool_process.use_x ? 
output_data[output_sub_idx] + : static_cast(0); + pool_process.compute(input, ouput_value, output_grad[output_sub_idx], + static_cast(1.0 / pool_size), + &input_grad_data); } } } - input_grad[index] = gradient; + input_grad[index] = input_grad_data; } } @@ -1045,7 +1142,7 @@ class Pool3dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool3DGrad<<>>( + KernelPool3DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, @@ -1099,7 +1196,7 @@ class Pool3dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool3DGrad<<>>( + KernelPool3DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, @@ -1267,33 +1364,33 @@ __global__ void KernelMaxPool2dWithIdx( const int input_height, const int input_width, const int output_height, const int output_width, const int ksize_height, const int ksize_width, const int stride_height, const int stride_width, const int padding_height, - const int padding_width, bool adaptive, T1* output_data, T2* mask_data) { + const int padding_width, bool adaptive, T1* output_data, T2* mask_data, + FastDivModForPooling divmods) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int pw = index % output_width; - int ph = (index / output_width) % output_height; - int c = (index / output_width / output_height) % channels; - int batch_idx = index / output_width / output_height / channels; + int hstart, hend, wstart, wend; + int w_offset, h_offset, c_offset, input_offset; + OffsetPreparationFor4Dimension( + index, false, divmods, 0, 0, input_width, input_height, &w_offset, + &h_offset, &c_offset, &input_offset); + input_data += input_offset; - int hstart, hend; - int wstart, wend; if (adaptive) { - hstart = AdaptStartIndex(ph, input_height, output_height); - hend = AdaptEndIndex(ph, input_height, output_height); + hstart = AdaptStartIndex(h_offset, input_height, output_height); + hend = AdaptEndIndex(h_offset, input_height, output_height); - wstart = AdaptStartIndex(pw, input_width, output_width); - wend = AdaptEndIndex(pw, input_width, output_width); + wstart = AdaptStartIndex(w_offset, input_width, output_width); + wend = AdaptEndIndex(w_offset, input_width, output_width); } else { - hstart = ph * stride_height - padding_height; + hstart = h_offset * stride_height - padding_height; hend = min(hstart + ksize_height, input_height); hstart = max(hstart, 0); - wstart = pw * stride_width - padding_width; + wstart = w_offset * stride_width - padding_width; wend = min(wstart + ksize_width, input_width); wstart = max(wstart, 0); } - input_data += (batch_idx * channels + c) * input_height * input_width; T1 ele = -FLT_MAX; int max_index = -1; for (int h = hstart; h < hend; ++h) { @@ -1317,16 +1414,17 @@ __global__ void KernelMaxPool2DWithIdxGrad( const int output_height, const int output_width, const int ksize_height, const int ksize_width, const int stride_height, const int stride_width, const int padding_height, const int padding_width, bool adaptive, - T1* input_grad) { + T1* input_grad, FastDivModForPooling divmods) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int w_offset = index % input_width; - int h_offset 
= (index / input_width) % input_height; - int offsetC = (index / input_width / input_height) % channels; - int batch_idx = index / input_width / input_height / channels; + int phstart, phend, pwstart, pwend; + int w_offset, h_offset, c_offset, output_offset; + OffsetPreparationFor4Dimension( + index, false, divmods, 0, 0, output_width, output_height, &w_offset, + &h_offset, &c_offset, &output_offset); + mask_data += output_offset; + output_grad += output_offset; - int phstart, phend; - int pwstart, pwend; if (adaptive) { phstart = h_offset * output_height / input_height; phend = @@ -1348,20 +1446,15 @@ __global__ void KernelMaxPool2DWithIdxGrad( pwend = min((w_offset + padding_width) / stride_width + 1, output_width); } - T1 gradient = 0; + T1 input_grad_data = 0; int input_current_featuremap_idx = h_offset * input_width + w_offset; - int output_idx = - (batch_idx * channels + offsetC) * output_height * output_width; - - mask_data += output_idx; - output_grad += output_idx; for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { if (mask_data[ph * output_width + pw] == input_current_featuremap_idx) - gradient += output_grad[ph * output_width + pw]; + input_grad_data += output_grad[ph * output_width + pw]; } } - input_grad[index] = gradient; + input_grad[index] = input_grad_data; } } @@ -1405,11 +1498,14 @@ class MaxPool2dWithIndexFunctor { int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); dim3 grid(blocks, 1); + + auto pool_divmods = + FastDivModForPooling(input_channels, output_width, output_height); KernelMaxPool2dWithIdx<<>>( nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, adaptive, output_data, - mask_data); + mask_data, pool_divmods); } }; @@ -1449,11 +1545,13 @@ class MaxPool2dWithIndexGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); + auto pool_divmods = + FastDivModForPooling(input_channels, input_width, input_height); KernelMaxPool2DWithIdxGrad<<>>( nthreads, output_grad_data, mask_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, adaptive, - input_grad_data); + input_grad_data, pool_divmods); } }; @@ -1542,7 +1640,8 @@ __global__ void KernelMaxPool3DWithIdxGrad( int w_offset = index % input_width; int h_offset = (index / input_width) % input_height; int d_offset = (index / input_width / input_height) % input_depth; - int offsetC = (index / input_width / input_height / input_depth) % channels; + int c_offset = + (index / input_width / input_height / input_depth) % channels; int batch_idx = index / input_width / input_height / input_depth / channels; int pdstart, pdend; @@ -1577,10 +1676,10 @@ __global__ void KernelMaxPool3DWithIdxGrad( pwend = min((w_offset + padding_width) / stride_width + 1, output_width); } - T1 gradient = 0; + T1 input_grad_data = 0; int input_current_feature_map_idx = (d_offset * input_height + h_offset) * input_width + w_offset; - int output_idx = (batch_idx * channels + offsetC) * output_depth * + int output_idx = (batch_idx * channels + c_offset) * output_depth * output_height * output_width; mask += output_idx; output_grad += output_idx; @@ -1590,12 +1689,12 @@ __global__ void KernelMaxPool3DWithIdxGrad( for (int pw = pwstart; pw < pwend; ++pw) { if (mask[(pd * output_height + ph) * output_width + pw] == input_current_feature_map_idx) - 
gradient += + input_grad_data += output_grad[(pd * output_height + ph) * output_width + pw]; } } } - input_grad[index] = gradient; + input_grad[index] = input_grad_data; } } diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h index 3547de0a4d7b7..3715f6e26104a 100644 --- a/paddle/fluid/operators/math/pooling.h +++ b/paddle/fluid/operators/math/pooling.h @@ -68,8 +68,9 @@ class AvgPool { template class MaxPoolGrad { public: - DEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, - T* dx) { + static constexpr bool use_x = true; + HOSTDEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, + T* dx) { *dx += dy * static_cast(x == y); } }; @@ -77,8 +78,9 @@ class MaxPoolGrad { template class AvgPoolGrad { public: - DEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, - T* dx) { + static constexpr bool use_x = false; + HOSTDEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, + T* dx) { *dx += (scale * dy); } }; diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 757cac4e4ffce..37dafa5c4908f 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -301,23 +301,9 @@ template struct SelectedRowsAddToTensor -typename std::enable_if::value>::type -elementwise_add_to(BlasT* blas, size_t data_len, - const T* in, T* out) { -#ifdef PADDLE_WITH_MKLDNN - onednn_handler_axpy(data_len, T(1.f), in, out); -#else - blas->AXPY(data_len, T(1.f), in, out); -#endif -} - -template -typename std::enable_if::value || - std::is_same::value || - std::is_same>::value || - std::is_same>::value>::type -elementwise_add_to(BlasT* blas, size_t data_len, - const T* in, T* out) { +typename std::enable_if::value>::type elementwise_add_to( + BlasT* blas, size_t data_len, const T* in, + T* out) { blas->AXPY(data_len, T(1.f), in, out); } @@ -330,6 +316,64 @@ typename std::enable_if::value>::type elementwise_add_to( } } +template +typename std::enable_if::value>::type +add_sparse_inputs(const std::vector& inputs, + const std::unordered_map& rows_to_id, + int64_t input_width, + const platform::CPUDeviceContext& context, T* out_data) { +#ifndef PADDLE_WITH_MKLDNN + auto blas = math::GetBlas(context); +#endif + for (auto* input : inputs) { + if (input->rows().size() == 0) { + continue; + } + auto* input_data = input->value().data(); + auto& input_rows = input->rows(); + +#ifdef PADDLE_WITH_MKLDNN + OneDNNAXPYHandler axpy_handler(input_width, T(1.f)); + for (size_t i = 0; i < input_rows.size(); i++) { + size_t out_i = rows_to_id.at(input_rows[i]); + axpy_handler(&input_data[i * input_width], + &out_data[out_i * input_width]); + } +#else + for (size_t i = 0; i < input_rows.size(); i++) { + size_t out_i = rows_to_id.at(input_rows[i]); + elementwise_add_to(&blas, static_cast(input_width), + &input_data[i * input_width], + &out_data[out_i * input_width]); + } +#endif + } +} + +template +typename std::enable_if::value>::type +add_sparse_inputs(const std::vector& inputs, + const std::unordered_map& rows_to_id, + int64_t input_width, + const platform::CPUDeviceContext& context, T* out_data) { + VLOG(4) << "[CPU] add_sparse_inputs <" << typeid(T).name(); + auto blas = math::GetBlas(context); + for (auto* input : inputs) { + if (input->rows().size() == 0) { + continue; + } + auto* input_data = input->value().data(); + auto& input_rows = input->rows(); + + for (size_t i = 0; i < 
input_rows.size(); i++) { + size_t out_i = rows_to_id.at(input_rows[i]); + elementwise_add_to(&blas, static_cast(input_width), + &input_data[i * input_width], + &out_data[out_i * input_width]); + } + } +} + template struct MergeAdd { framework::SelectedRows operator()(const platform::CPUDeviceContext& context, @@ -435,21 +479,7 @@ struct MergeAdd { rows_to_id[merge_rows[i]] = i; } - auto blas = math::GetBlas(context); - for (auto* input : inputs) { - if (input->rows().size() == 0) { - continue; - } - auto* input_data = input->value().data(); - auto& input_rows = input->rows(); - - for (size_t i = 0; i < input_rows.size(); i++) { - size_t out_i = rows_to_id[input_rows[i]]; - elementwise_add_to(&blas, static_cast(input_width), - &input_data[i * input_width], - &out_data[out_i * input_width]); - } - } + add_sparse_inputs(inputs, rows_to_id, input_width, context, out_data); } } }; diff --git a/paddle/fluid/operators/math/seq2col.h b/paddle/fluid/operators/math/seq2col.h new file mode 100644 index 0000000000000..56134b6f0ea5c --- /dev/null +++ b/paddle/fluid/operators/math/seq2col.h @@ -0,0 +1,186 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace operators { +namespace math { + +template +struct Seq2ColFunctor { + Seq2ColFunctor(const T* seq, T* col, size_t seq_length, size_t frame_length, + size_t n_frames, size_t hop_length) + : seq_(seq), + col_(col), + seq_length_(seq_length), + frame_length_(frame_length), + n_frames_(n_frames), + hop_length_(hop_length) {} + + /* + Convert sequences to frames. + + 1. Dimension infomation: + + Sequences Frames + (N, seq_length) -> (N, frame_length, n_frames) + + 2. Mapping from `i` to `src_idx` and `trg_idx` can be derived from: + + a. Notion + - `i` stands for the flattened index of a bunch of frames. + - `src_idx` and `trg_idx` are the 1D indices of seqs and frames + respectivly. + + b. Sample idx + ```cpp + sample_idx = i / (n_frames_ * frame_length_); + ``` + + c. Maps `i` to `f` and `n`. + ```cpp + f = i % (n_frames_ * frame_length_) / n_frames_; + n = i % (n_frames_ * frame_length_) % n_frames_; + ``` + + d. Replace `sample_idx`, `f` and `n` in the following eqations: + ```cpp + src_idx = sample_idx * seq_length_ + n * hop_length_ + f; + trg_idx = sample_idx * n_frames_ * frame_length_ + f * n_frames_ + n; + col_[trg_idx] = seq_[src_idx]; + ``` + + e. Result can be deduced shown in the function body below. 
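+
+    f. Worked example (editorial): with seq_length_ = 5, frame_length_ = 2,
+       hop_length_ = 2 and n_frames_ = 2, a sequence [s0, s1, s2, s3, s4]
+       maps to the 2 x 2 frame matrix
+
+           [[s0, s2],
+            [s1, s3]]
+
+       i.e. frame n is the column [s(n * hop), s(n * hop + 1)]; s4 is
+       dropped because it cannot fill a complete frame.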
+ */ + HOSTDEVICE void operator()(size_t i) const { + size_t src_idx; + size_t trg_idx; + src_idx = i / (n_frames_ * frame_length_) * seq_length_ + + i % (n_frames_ * frame_length_) % n_frames_ * hop_length_ + + i % (n_frames_ * frame_length_) / n_frames_; + trg_idx = i / (n_frames_ * frame_length_) * n_frames_ * frame_length_ + + i % (n_frames_ * frame_length_) / n_frames_ * n_frames_ + + i % (n_frames_ * frame_length_) % n_frames_; + col_[trg_idx] = seq_[src_idx]; + } + + const T* seq_; + T* col_; + size_t seq_length_; + size_t frame_length_; + size_t n_frames_; + size_t hop_length_; +}; + +template +struct Col2SeqFunctor { + Col2SeqFunctor(const T* col, T* seq, size_t seq_length, size_t frame_length, + size_t n_frames, size_t hop_length) + : col_(col), + seq_(seq), + seq_length_(seq_length), + frame_length_(frame_length), + n_frames_(n_frames), + hop_length_(hop_length) {} + + /* + Accumulate output gradient d_out to d_x. + + 1. Dimension infomation: + + d_out d_x + (N, frame_length, n_frames) -> (N, seq_length) + + 2. Using a sliding window to find source indices from `d_out` according to + `i`: + + a. Notion + - `i` stands for the flattened index of `d_x`. + - `seq_i` stands for a relative index of a `d_x` sample. + - `left`: Starting index of a frame window. + - `right`: Ending index of a frame window. + + b. Sample idx + ```cpp + sample_idx = i / seq_length_; + ``` + + c. Slides a window with length of `frame_length` to find `f` and `n`. + - `n`: The idx of num_frames_, increases in each hop. + - `f`: The idx of frame_lengths_, relative idx from left of a sliding + window. + + d. Accumulate all grads from d_out. + ```cpp + seq_[i] += + col_[sample_idx * frame_length_ * n_frames_ + f * n_frames_ + n]; + ``` + */ + HOSTDEVICE void operator()(size_t i) const { + size_t sample_idx = i / seq_length_; + size_t seq_i = i % seq_length_; + + // Sliding window + seq_[i] = 0; // Init seq_[i] to 0, and sums up all + // grads from col_ in the while loop. + + size_t n = get_start_frame_idx(seq_i); + size_t f; + size_t left = n * hop_length_; + size_t right = left + frame_length_ - 1; + + while (left <= seq_i && right < seq_length_) { + f = seq_i - left; + seq_[i] += + col_[sample_idx * frame_length_ * n_frames_ + f * n_frames_ + n]; + // Next frame. 
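+      // The n-th window covers seq indices [n * hop_length_,
+      // n * hop_length_ + frame_length_ - 1]; accumulation continues while
+      // the window still contains seq_i and stays inside the sequence.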
+ left += hop_length_; + right += hop_length_; + n += 1; + } + } + + /* + Calculate minimum value of frame index `n` to satisfy the inequality: + + seq_i <= right + ==> seq_i <= left + frame_length - 1 + ==> seq_i <= hop_length_ * n + frame_length_ - 1 + */ + HOSTDEVICE size_t get_start_frame_idx(size_t seq_i) const { + int64_t tmp = seq_i + 1 - frame_length_; + if (tmp > 0) { + size_t n = tmp / hop_length_; + if (tmp % hop_length_ == 0) { + return n; + } else { + return n + 1; + } + } else { + return 0; + } + } + + const T* col_; + T* seq_; + size_t seq_length_; + size_t frame_length_; + size_t n_frames_; + size_t hop_length_; +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h index 58e57c3914f41..dd9940db29f77 100644 --- a/paddle/fluid/operators/matmul_v2_op.h +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -380,6 +380,14 @@ class MatMulV2Kernel : public framework::OpKernel { auto* Out = ctx.Output("Out"); bool trans_x = ctx.Attr("trans_x"); bool trans_y = ctx.Attr("trans_y"); + PADDLE_ENFORCE_NE(framework::product(X->dims()), 0, + platform::errors::InvalidArgument( + "The Input(X) dims size must not be equal 0," + " but reviced dims size is 0. ")); + PADDLE_ENFORCE_NE(framework::product(Y->dims()), 0, + platform::errors::InvalidArgument( + "The Input(Y) dims size must not be equal 0," + " but reviced dims size is 0. ")); MatMulFunction(X, Y, Out, trans_x, trans_y, ctx); } }; diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.cc b/paddle/fluid/operators/mkldnn/axpy_handler.cc index 76101f19ab618..ed265edf003e0 100644 --- a/paddle/fluid/operators/mkldnn/axpy_handler.cc +++ b/paddle/fluid/operators/mkldnn/axpy_handler.cc @@ -22,7 +22,6 @@ limitations under the License. 
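Note on the axpy_handler changes that follow: the cached, key-based AXPYMKLDNNHandler and the free function onednn_handler_axpy are replaced by OneDNNAXPYHandler, which hides its oneDNN state behind a pimpl and expresses y = alpha * x + y as a single reorder whose output scale is alpha and whose sum post-op adds the existing destination. A standalone sketch of that mechanism against the plain oneDNN C++ API (illustrative only, not Paddle code):

#include <vector>
#include "dnnl.hpp"

int main() {
  const int n = 8;
  const float alpha = 2.0f;
  std::vector<float> x(n, 1.0f), y(n, 3.0f);

  dnnl::engine eng(dnnl::engine::kind::cpu, 0);
  dnnl::stream stream(eng);

  auto md = dnnl::memory::desc({n}, dnnl::memory::data_type::f32,
                               dnnl::memory::format_tag::x);
  dnnl::memory src(md, eng, x.data());
  dnnl::memory dst(md, eng, y.data());

  dnnl::primitive_attr attr;
  attr.set_output_scales(0, {alpha});  // dst = alpha * src ...
  dnnl::post_ops ops;
  ops.append_sum(1.0f);                // ... + 1.0 * dst
  attr.set_post_ops(ops);

  dnnl::reorder axpy(src, dst, attr);
  axpy.execute(stream, src, dst);
  stream.wait();  // each y[i] is now alpha * x[i] + y[i] = 5.0f
  return 0;
}

Inside Paddle the intended call pattern is to construct the handler once per (n, alpha) and reuse it, as add_sparse_inputs above does (x_ptr/y_ptr stand for the raw float buffers):

  paddle::operators::OneDNNAXPYHandler<float> axpy(/*n=*/1024, /*alpha=*/1.0f);
  axpy(x_ptr, y_ptr);  // y = alpha * x + y; sizes below 100 fall back to a naive loop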
*/ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/mkldnn_helper.h" -#include "paddle/fluid/platform/mkldnn_reuse.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" @@ -34,76 +33,46 @@ namespace plat = paddle::platform; namespace { template -class AXPYMKLDNNHandler : public plat::MKLDNNHandlerT { +class AXPYHandler { public: - AXPYMKLDNNHandler(const plat::MKLDNNDeviceContext &dev_ctx, - const dnnl::engine mkldnn_engine, plat::Place cpu_place, - int n, float alpha) - : plat::MKLDNNHandlerT( - dev_ctx, mkldnn_engine, cpu_place, - plat::CreateKey(dev_ctx, static_cast(n), - plat::MKLDNNGetDataType(), alpha, "-axpy")), - alpha_(alpha), - n_(n) {} - - std::shared_ptr AcquireMemory(void *ptr, - const std::string &suffix) { - /*Generate key*/ - auto local_key = this->key_ + suffix; - auto mem_p = std::static_pointer_cast( - this->dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - auto md = dnnl::memory::desc({n_}, plat::MKLDNNGetDataType(), - dnnl::memory::format_tag::x); - mem_p = std::make_shared(md, this->engine_, ptr); - this->dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); + AXPYHandler(const dnnl::engine mkldnn_engine, int n, float alpha) { + platform::MKLDNNDeviceContext::tls().log_lib_version(); + auto md = dnnl::memory::desc({n}, plat::MKLDNNGetDataType(), + dnnl::memory::format_tag::x); + src_mem_ = dnnl::memory(md, mkldnn_engine, DNNL_MEMORY_NONE); + dst_mem_ = dnnl::memory(md, mkldnn_engine, DNNL_MEMORY_NONE); + dnnl::primitive_attr reorder_attr; + dnnl::post_ops post_operations; + if (alpha != 1.f) { + std::vector scales(1, alpha); + reorder_attr.set_output_scales(0, scales); } - return mem_p; - } + post_operations.append_sum(1.0f); - std::shared_ptr AcquireSrcMemory(const T *x) { - return this->AcquireMemory(plat::to_void_cast(x), "@user_src_mem_p"); + reorder_attr.set_post_ops(post_operations); + reorder_p_ = dnnl::reorder(src_mem_, dst_mem_, reorder_attr); } - std::shared_ptr AcquireDstMemory(T *y) { - return this->AcquireMemory(y, "@user_dst_mem_p"); + dnnl::memory &AcquireSrcMemory(const T *x) { + src_mem_.set_data_handle(plat::to_void_cast(x)); + return src_mem_; } - std::shared_ptr AcquireReorder( - std::shared_ptr dst_memory_p, - std::shared_ptr src_memory_p) { - auto prim_key = this->key_ + "@reorder_p"; - auto reorder_p = std::static_pointer_cast( - this->dev_ctx_.GetBlob(prim_key)); - if (reorder_p == nullptr) { - // Here we pass Postops to mimick y -> a*X + y - dnnl::primitive_attr reorder_attr; - dnnl::post_ops post_operations; - if (this->alpha_ != 1.f) { - std::vector scales(1, this->alpha_); - reorder_attr.set_output_scales(0, scales); - } - post_operations.append_sum(1.0f); - - reorder_attr.set_post_ops(post_operations); - reorder_p = std::make_shared( - *(src_memory_p), *(dst_memory_p), reorder_attr); - this->dev_ctx_.SetBlob(prim_key, reorder_p); - } - return reorder_p; + dnnl::memory &AcquireDstMemory(T *y) { + dst_mem_.set_data_handle(y); + return dst_mem_; } + const dnnl::reorder &AcquireReorder() { return reorder_p_; } + private: - float alpha_; - int n_; + dnnl::memory src_mem_; + dnnl::memory dst_mem_; + dnnl::reorder reorder_p_; }; -template class AXPYMKLDNNHandler; -template class AXPYMKLDNNHandler; - -} // anonnymouse namespace +template class AXPYHandler; +template class AXPYHandler; template static void naive_axpy(int n, T alpha, const T *x, T *y) { @@ -114,39 +83,60 @@ static void naive_axpy(int n, T 
alpha, const T *x, T *y) { } } +} // anonnymouse namespace + template -void onednn_handler_axpy(int n, T alpha, const T *x, T *y) { - // fallback to naive version - if (n < 100) { - naive_axpy(n, alpha, x, y); - return; - } +class OneDNNAXPYHandler::Impl { + public: + Impl(int64_t n, T alpha); + void operator()(const T *x, T *y); + + private: + std::unique_ptr> handler_; + int64_t n_; + T alpha_; +}; +template +OneDNNAXPYHandler::Impl::Impl(int64_t n, T alpha) : n_{n}, alpha_{alpha} { auto &pool = plat::DeviceContextPool::Instance(); auto cpu_place = plat::CPUPlace(); auto *dev_ctx = dynamic_cast(pool.Get(cpu_place)); auto &cpu_engine = dev_ctx->GetEngine(); + handler_ = std::make_unique>(cpu_engine, n, + static_cast(alpha)); +} - AXPYMKLDNNHandler handler(*dev_ctx, cpu_engine, cpu_place, n, - static_cast(alpha)); - - auto reorder_src_memory_p = handler.AcquireSrcMemory(x); - auto reorder_dst_memory_p = handler.AcquireDstMemory(y); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); +template +void OneDNNAXPYHandler::Impl::operator()(const T *x, T *y) { + if (this->n_ < 100) { + naive_axpy(this->n_, this->alpha_, x, y); + return; + } + auto &reorder_src_mem_p = handler_->AcquireSrcMemory(x); + auto &reorder_dst_mem_p = handler_->AcquireDstMemory(y); + auto reorder_p = handler_->AcquireReorder(); auto &astream = plat::MKLDNNDeviceContext::tls().get_stream(); - plat::RecordEvent record_reorder("axpy_int_reorder", - plat::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); + reorder_p.execute(astream, reorder_src_mem_p, reorder_dst_mem_p); astream.wait(); } -template void onednn_handler_axpy(int, float, const float *, float *); -template void onednn_handler_axpy(int, plat::bfloat16, - const plat::bfloat16 *, - plat::bfloat16 *); +template +OneDNNAXPYHandler::OneDNNAXPYHandler(int64_t n, T alpha) + : pimpl_{new Impl{n, alpha}, [](Impl *impl) { delete impl; }} { + VLOG(4) << "[OneDNN] OneDNNAXPYHandler<" << typeid(T).name() << ">, " + << "n: " << n << ", alpha: " << alpha; +} + +template +void OneDNNAXPYHandler::operator()(const T *x, T *y) { + pimpl_->operator()(x, y); +} + +template class OneDNNAXPYHandler; +template class OneDNNAXPYHandler; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.h b/paddle/fluid/operators/mkldnn/axpy_handler.h index 8f0fdeb5c02b4..677fe3b010c24 100644 --- a/paddle/fluid/operators/mkldnn/axpy_handler.h +++ b/paddle/fluid/operators/mkldnn/axpy_handler.h @@ -13,21 +13,47 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include + namespace paddle { namespace operators { /// -/// @brief Helper function to execute AXPY using oneDNN. -/// -/// @param[in] n The number of elements in tensor (assumed 1D) -/// @param[in] alpha The alpha coefficient. -/// @param[in] x The pointer to input X tensor. -/// @param y The pointer to output Y tensor. +/// @brief Helper class for AXPY execution using oneDNN library. /// -/// @tparam T Data type. +/// @tparam T Data type. /// template -void onednn_handler_axpy(int n, T alpha, const T *x, T *y); +class OneDNNAXPYHandler { + public: + OneDNNAXPYHandler(OneDNNAXPYHandler&) = delete; + OneDNNAXPYHandler(OneDNNAXPYHandler&&) = delete; + OneDNNAXPYHandler& operator=(OneDNNAXPYHandler&) = delete; + OneDNNAXPYHandler& operator=(OneDNNAXPYHandler&&) = delete; + /// + /// @brief Constructor. 
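+  /// Construct once per (n, alpha) pair: construction builds the underlying
+  /// oneDNN reorder primitive, and every subsequent operator() call reuses it.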
+ /// + /// @param[in] n The number of elements in tensor (assumed 1D tensor) + /// @param[in] alpha The alpha coefficient. + /// + OneDNNAXPYHandler(int64_t n, T alpha); + /// + /// @brief Executes AXPY. + /// + /// @param[in] x The pointer to input X tensor data. + /// @param[out] y The pointer to output Y tensor data. + /// + void operator()(const T* x, T* y); + + private: + OneDNNAXPYHandler() = delete; + // (arogowie-intel) Private implementation idiom to hide dependency + // on OneDNN headers. + class Impl; + // We need custom deleter, since the compiler is unable to parameterize + // an allocator's default deleter due to incomple type. + std::unique_ptr pimpl_; +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc index 9cfeace6bef99..6f79c2dccf56b 100644 --- a/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc @@ -43,11 +43,9 @@ class CastMKLDNNKernel : public framework::OpKernel { auto x_tz = framework::vectorize(x->dims()); - std::string key = - platform::CreateKey(dev_ctx, x_tz, x->format(), x->format(), x_type); - platform::ReorderMKLDNNHandler reorder_handler( - x_tz, x_paddle_type, x_type, out_paddle_type, out_type, dev_ctx, - dev_ctx.GetEngine(), key); + platform::ReorderMKLDNNHandler reorder_handler(x_tz, x_paddle_type, x_type, + out_paddle_type, out_type, + dev_ctx.GetEngine()); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( x->format(), platform::to_void_cast(x->data())); diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index b353ce4c322e4..1b69dd7ea00c7 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -706,7 +706,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { platform::CreateKey(dev_ctx, src_tz, src_dt, ctx.InputName("Input") + ctx.InputName("Filter")); - const std::string key_conv_pd = key + "@conv_pd"; bool need_s8_to_u8 = false; std::shared_ptr conv_p; std::shared_ptr src_memory_p; @@ -721,6 +720,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { // are merged/unified, this will disappear auto key_tid = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); + const std::string key_conv_pd = key_tid + "@conv_pd"; auto prim_key = key_tid + "@conv_p"; auto dst_key = key_tid + "@dst_mem_p"; auto src_key = key_tid + "@src_mem_p"; @@ -731,12 +731,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto src_reorder_key = key_tid + "@src_mem_preorder_p"; auto residual_reorder_key = key_tid + "@residual_data_mem_preorder_p"; - conv_p = std::static_pointer_cast( - dev_ctx.GetBlob(prim_key)); + conv_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_conv_pd)); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - if (conv_p == nullptr || !is_test) { + if (conv_pd == nullptr || !is_test) { float fuse_alpha = ctx.Attr("fuse_alpha"); float fuse_beta = ctx.Attr("fuse_beta"); bool force_fp32_output = ctx.Attr("force_fp32_output"); @@ -946,7 +947,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { } // create convolution op primitive - auto scale_bias_key = key + "@scale_bias"; conv_p = handler->AcquireConvolution(); if (bias) { const K* bias_data = bias->data(); @@ -1000,13 +1000,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { dev_ctx.GetBlob(weights_key)); 
dst_memory_p = std::static_pointer_cast(dev_ctx.GetBlob(dst_key)); - conv_pd = - std::static_pointer_cast( - dev_ctx.GetBlob(key_conv_pd)); - if (conv_pd) { - handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, - mkldnn_engine, key)); - } + conv_p = std::static_pointer_cast( + dev_ctx.GetBlob(prim_key)); + handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, + mkldnn_engine, key)); if (fuse_residual_conn) { auto residual_param = ctx.Input("ResidualData"); @@ -1125,12 +1122,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { mkldnn::memory::format_tag out_format = weights_tz.size() == 6 ? mkldnn::memory::format_tag::goidhw : mkldnn::memory::format_tag::goihw; - std::string key = platform::CreateKey(dev_ctx, weights_tz, filter_fmt, - out_format, in_type); - key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); - - platform::ReorderMKLDNNHandler handler( - weights_tz, filter->type(), in_type, dev_ctx, mkldnn_engine, key); + platform::ReorderMKLDNNHandler handler(weights_tz, filter->type(), + in_type, mkldnn_engine); auto reorder_dst_memory_p = handler.AcquireDstMemory(filter_grad, out_format, ctx.GetPlace()); diff --git a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc index 829c948c1a5d1..d537c3dbf9fdd 100644 --- a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc @@ -114,10 +114,8 @@ class ExpandGradMKLDNNKernel : public paddle::framework::OpKernel { if (dout_vec_dims == dx_vec_dims) { mkldnn::memory::data_type dout_type = paddle::framework::ToMKLDNNDataType(dout->type()); - std::string key = paddle::platform::CreateKey( - dev_ctx, dout_vec_dims, dout->format(), dout->format(), dout_type); paddle::platform::ReorderMKLDNNHandler reorder_handler( - dout_vec_dims, dout->type(), dout_type, dev_ctx, onednn_engine, key); + dout_vec_dims, dout->type(), dout_type, onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( dout->format(), paddle::platform::to_void_cast(dout->data())); diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc index 723c3c8352d54..b78acd32e6dc8 100644 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc @@ -58,11 +58,8 @@ static Tensor FoldFirstAndLastDims(const MKLDNNDeviceContext& dev_ctx, memory::data_type input_type = paddle::framework::ToMKLDNNDataType(input->type()); - std::string key = paddle::platform::CreateKey( - dev_ctx, input_dims, input->format(), input->format(), input_type); paddle::platform::ReorderMKLDNNHandler reorder_handler( - output_dims, input->type(), input_type, dev_ctx, dev_ctx.GetEngine(), - key); + output_dims, input->type(), input_type, dev_ctx.GetEngine()); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( memory::format_tag::abc, diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index d5e428bd805fb..e6a7f3e74fcc7 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -93,10 +93,8 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { } mkldnn::memory::data_type x_type = framework::ToMKLDNNDataType(x->type()); - std::string key = - platform::CreateKey(dev_ctx, x_vec_dims, x->format(), x_type); - platform::ReorderMKLDNNHandler reorder_handler( - x_vec_dims, x->type(), x_type, dev_ctx, 
onednn_engine, key); + platform::ReorderMKLDNNHandler reorder_handler(x_vec_dims, x->type(), + x_type, onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( x->format(), platform::to_void_cast(x->data())); @@ -253,11 +251,8 @@ class ReshapeGradMKLDNNKernel : public ReshapeMKLDNNKernel { mkldnn::memory::data_type dout_type = framework::ToMKLDNNDataType(dout->type()); - std::string key = - platform::CreateKey(dev_ctx, dout_vec_dims, this->getPlainFormatTag(dx), - dx->format(), dout_type); - platform::ReorderMKLDNNHandler reorder_handler( - dout_vec_dims, dout->type(), dout_type, dev_ctx, onednn_engine, key); + platform::ReorderMKLDNNHandler reorder_handler(dout_vec_dims, dout->type(), + dout_type, onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( dout->format(), platform::to_void_cast(dout->data())); diff --git a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc index facf5ca4b8397..6bc3413604e22 100644 --- a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc @@ -98,18 +98,16 @@ class SliceMKLDNNKernel : public framework::OpKernel { out->Resize(framework::make_ddim(slice_dims)); mkldnn::memory::data_type x_type = framework::ToMKLDNNDataType(x->type()); - auto key = platform::CreateKey(dev_ctx, x_vec_dims, axes, starts, ends, - x->format(), x_type); - platform::ReorderMKLDNNHandler reorder_handler( - x_vec_dims, x->type(), x_type, dev_ctx, onednn_engine, key); + platform::ReorderMKLDNNHandler reorder_handler(x_vec_dims, x->type(), + x_type, onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( x->format(), platform::to_void_cast(x->data())); auto slice_mem_p = reorder_handler.AcquireSubmemory(slice_dims, offsets, reorder_src_memory_p); auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - out, slice_dims, 0, get_plain_format_tag(x), ctx.GetPlace()); + out, slice_dims, get_plain_format_tag(x), ctx.GetPlace()); auto reorder_p = reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p); @@ -201,16 +199,13 @@ class SliceGradMKLDNNKernel : public framework::OpKernel { mkldnn::memory::format_tag reorder_format_tag = platform::GetMKLDNNFormat(md.reshape(slice_dims)); - auto key = platform::CreateKey(dev_ctx, dout_vec_dims, axes, starts, ends, - reorder_format_tag, dout_type); - - platform::ReorderMKLDNNHandler reorder_handler( - slice_dims, dout->type(), dout_type, dev_ctx, onednn_engine, key); + platform::ReorderMKLDNNHandler reorder_handler(slice_dims, dout->type(), + dout_type, onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( reorder_format_tag, platform::to_void_cast(dout->data())); auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - dx, dx_vec_dims, 0, reorder_format_tag, ctx.GetPlace()); + dx, dx_vec_dims, reorder_format_tag, ctx.GetPlace()); memset(dx->data(), 0, reorder_dst_memory_p->get_desc().get_size()); auto slice_mem_p = reorder_handler.AcquireSubmemory(slice_dims, offsets, diff --git a/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc index 8a58d9f26f87b..411f33276c361 100644 --- a/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc @@ -91,27 +91,25 @@ class SplitMKLDNNKernel : public framework::OpKernel { auto x_vec_dims = framework::vectorize(x_dims); mkldnn::memory::data_type x_type = framework::ToMKLDNNDataType(x->type()); - auto key = 
platform::CreateKey(dev_ctx, x_vec_dims, axis, num, sections, - x->format(), x_type); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); std::vector offset(x_vec_dims.size(), 0); - platform::ReorderMKLDNNHandler reorder_handler( - x_vec_dims, x->type(), x_type, dev_ctx, onednn_engine, key); + platform::ReorderMKLDNNHandler reorder_handler(x_vec_dims, x->type(), + x_type, onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( x->format(), platform::to_void_cast(x->data())); for (size_t i = 0; i < outs_number; ++i) { auto out_vec_dims = framework::vectorize(outs[i]->dims()); - auto slice_mem_p = reorder_handler.AcquireSubmemory( - out_vec_dims, offset, reorder_src_memory_p, i); + auto slice_mem_p = reorder_handler.AcquireSubmemory(out_vec_dims, offset, + reorder_src_memory_p); auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - outs[i], out_vec_dims, i, x->format(), ctx.GetPlace()); + outs[i], out_vec_dims, x->format(), ctx.GetPlace()); auto reorder_p = - reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p, i); + reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p); reorder_p->execute(astream, *slice_mem_p, *reorder_dst_memory_p); diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 4cc9f53b9b6b2..8208a484b4a32 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -155,15 +155,11 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { // For in-place execution which sum does not have we need to fake it // so from oneDNN dst memory we reorder data into input if (in_place) { - const std::string reorder_key = - platform::CreateKey(dev_ctx, framework::vectorize(output->dims()), - ctx.OutputName("Out") + "-I"); - auto& in_out = in_vars[0]->Get(); auto output_tz = framework::vectorize(output->dims()); platform::ReorderMKLDNNHandler reorder_handler( output_tz, output->type(), framework::ToMKLDNNDataType(in_out.type()), - dev_ctx, dev_ctx.GetEngine(), reorder_key); + dev_ctx.GetEngine()); auto target_mem = reorder_handler.AcquireDstMemory( output, in_out.format(), ctx.GetPlace()); diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu index 64331b88bfc04..505e322310caf 100644 --- a/paddle/fluid/operators/multiplex_op.cu +++ b/paddle/fluid/operators/multiplex_op.cu @@ -29,6 +29,14 @@ class MultiplexGPUKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); + for (size_t i = 0; i < ins.size(); ++i) { + PADDLE_ENFORCE_GT( + ins[i]->numel(), 0, + platform::errors::OutOfRange( + "indexing will be out of bounds with size 0 for the %d-th input.", + i)); + } + auto rows = ins[0]->dims()[0]; auto cols = ins[0]->numel() / rows; // copy index to cpu diff --git a/paddle/fluid/operators/multiplex_op.h b/paddle/fluid/operators/multiplex_op.h index cb8d5eb2f761d..c0f24a2034a15 100644 --- a/paddle/fluid/operators/multiplex_op.h +++ b/paddle/fluid/operators/multiplex_op.h @@ -31,6 +31,14 @@ class MultiplexCPUKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); + for (size_t i = 0; i < ins.size(); ++i) { + PADDLE_ENFORCE_GT( + ins[i]->numel(), 0, + platform::errors::OutOfRange( + "indexing will be out of bounds with size 0 for the %d-th input.", + i)); + } + auto rows = ins[0]->dims()[0]; auto cols = ins[0]->numel() / rows; auto index = ids->data(); diff --git 
a/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc b/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc new file mode 100644 index 0000000000000..450ef376799d3 --- /dev/null +++ b/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc @@ -0,0 +1,85 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/mkldnn/axpy_handler.h" +#include "paddle/fluid/operators/optimizers/sgd_op.h" + +namespace pplat = paddle::platform; + +namespace paddle { +namespace operators { + +template +class SGDOneDNNKernel : public SGDOpKernel { + protected: + void dense_param_and_grad_kernel( + const framework::ExecutionContext &ctx) const override { + VLOG(4) << "[ONEDNN]: sgd_dense_param_kernel"; + const auto *learning_rate = ctx.Input("LearningRate"); + const auto *param = ctx.Input("Param"); + auto *param_out = ctx.Output("ParamOut"); + const auto *grad = ctx.Input("Grad"); + + auto *out_data = param_out->mutable_data(ctx.GetPlace()); + const T *param_data = param->data(); + const auto *grad_data = grad->data(); + const auto *lr = learning_rate->data(); + // Since denese SGD is not in place operation, first copy params to output + // tensor and then update it. + std::memcpy(out_data, param_data, param->memory_size()); + OneDNNAXPYHandler(param_out->numel(), -lr[0])(grad_data, out_data); + } + + void dense_param_sparse_grad_kernel( + const framework::ExecutionContext &ctx) const override { + VLOG(4) << "[ONEDNN]: sgd_dense_param_kernel"; + const auto *learning_rate = ctx.Input("LearningRate"); + auto *param_out = ctx.Output("ParamOut"); + const auto *grad = ctx.Input("Grad"); + + const auto &grad_value = grad->value(); + const auto &grad_rows = grad->rows(); + const auto grad_height = grad->height(); + const int64_t grad_val_height = static_cast(grad_rows.size()); + const auto grad_width = grad_value.numel() / grad_val_height; + + const auto *grad_data = grad_value.data(); + auto *out_data = param_out->data(); + const auto *lr = learning_rate->data(); + + OneDNNAXPYHandler axpy_handler(grad_width, -lr[0]); + + for (size_t i = 0; i < grad_rows.size(); ++i) { + PADDLE_ENFORCE_LT( + grad_rows[i], grad_height, + pplat::errors::OutOfRange( + "Grad rows index value should be less than grad height." + "Got [%s], but expected less than [%s]", + grad_rows[i], grad_height)); + const int64_t row = grad_rows[i]; + const auto *src = grad_data + i * grad_width; + auto *dst = out_data + row * grad_width; + axpy_handler(src, dst); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(sgd, MKLDNN, pplat::CPUPlace, ops::SGDOneDNNKernel, + ops::SGDOneDNNKernel); diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc index 9603411ec4513..b2e258f815d72 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cc +++ b/paddle/fluid/operators/optimizers/sgd_op.cc @@ -15,6 +15,9 @@ limitations under the License. 
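Note on the AXPY formulation used by SGDOneDNNKernel above: the dense update param_out = param - lr * grad is expressed as y := alpha * x + y with alpha = -lr, x = grad and y = param_out (the parameters are first copied into the output buffer, since dense SGD is not an in-place operation). Below is a minimal scalar reference of that update, with no oneDNN involved and all names invented for the example:

#include <cstddef>
#include <iostream>
#include <vector>

// Plain reference of the update the oneDNN AXPY handler performs:
// y[i] = alpha * x[i] + y[i] for i in [0, n).
template <typename T>
void axpy_reference(std::size_t n, T alpha, const T* x, T* y) {
  for (std::size_t i = 0; i < n; ++i) {
    y[i] = alpha * x[i] + y[i];
  }
}

int main() {
  const float lr = 0.1f;
  std::vector<float> grad = {1.0f, 2.0f, 3.0f};
  std::vector<float> param = {0.5f, 0.5f, 0.5f};

  // Dense SGD step: param -= lr * grad, phrased as AXPY with alpha = -lr.
  axpy_reference(param.size(), -lr, grad.data(), param.data());

  for (float v : param) std::cout << v << " ";  // 0.4 0.3 0.2
  std::cout << "\n";
  return 0;
}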
*/ #include #include "paddle/fluid/operators/optimizers/sgd_op.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace operators { @@ -67,6 +70,26 @@ class SGDOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Param"); + +#ifdef PADDLE_WITH_MKLDNN + using mkldnn::memory; + if (this->CanMKLDNNBeUsed(ctx, data_type)) { + const auto *param_var = ctx.InputVar("Param"); + const auto *grad_var = ctx.InputVar("Grad"); + + // supported cases + bool dense_param_sparse_grad = + param_var->IsType() && + grad_var->IsType(); + bool dense_param_and_grad = param_var->IsType() && + grad_var->IsType(); + + if (dense_param_sparse_grad || dense_param_and_grad) + return framework::OpKernelType(data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif return framework::OpKernelType(data_type, ctx.device_context()); } @@ -106,6 +129,10 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("ParamOut", "(Tensor or SelectedRows, same with Param) " "Output parameter, should share the same memory with Param"); + AddAttr( + "use_mkldnn", + "(bool, default false) Indicates if MKL-DNN kernel will be used") + .SetDefault(false); AddComment(R"DOC( SGD operator diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index 076afdc655386..a1fb3debb48e6 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -19,9 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/operators/jit/kernels.h" -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/operators/mkldnn/axpy_handler.h" -#endif #include "paddle/fluid/platform/bfloat16.h" namespace paddle { @@ -142,98 +139,13 @@ struct sgd_dense_param_kernel< "Got [%s], but expected less than [%s]", grad_rows[i], grad_height)); const int64_t row = grad_rows[i]; -#ifdef PADDLE_WITH_MKLDNN - operators::onednn_handler_axpy(grad_width, -lr[0], - grad_data + i * grad_width, - out_data + row * grad_width); -#else for (int64_t j = 0; j < grad_width; ++j) { out_data[row * grad_width + j] -= lr[0] * grad_data[i * grad_width + j]; } -#endif } } }; -template -void sgd_op_invoke_dense_param_kernel(const framework::ExecutionContext &ctx) { - const auto *param = ctx.Input("Param"); - auto *param_out = ctx.Output("ParamOut"); - const auto *grad_var = ctx.InputVar("Grad"); - - if (grad_var->IsType()) { - const auto *grad = ctx.Input("Grad"); - const auto sz = param_out->numel(); - PADDLE_ENFORCE_EQ(param->numel(), sz, - platform::errors::InvalidArgument( - "The input tensor Param's numel of SgdOp " - "should be equal with ParamOut's numel. " - "But received Param's " - "numel = [%s], ParamOut's numel = [%s]", - param->numel(), sz)); - PADDLE_ENFORCE_EQ(grad->numel(), sz, - platform::errors::InvalidArgument( - "The input tensor Grad's numel of SgdOp " - "should be equal with ParamOut's numel. " - "But received Grad's " - "numel = [%s], ParamOut's numel = [%s]", - grad->numel(), sz)); - - sgd_dense_param_kernel< - T, framework::VarTypeTrait::kId>()(ctx); - } else if (grad_var->IsType()) { - // TODO(qijun): In Sparse SGD operator, in-place update is enforced. 
- // This manual optimization brings difficulty to track data dependency. - // It's better to find a more elegant solution. - PADDLE_ENFORCE_EQ(param, param_out, - platform::errors::InvalidArgument( - "The input tensor Param of SgdOp " - "should be equal with ParamOut if variable's " - "type is SelectedRows. ")); - const auto *grad = ctx.Input("Grad"); - - // for distributed training, a sparse var may be empty, - // just skip updating. - if (grad->rows().size() == 0) { - return; - } - - auto out_dims = param_out->dims(); - PADDLE_ENFORCE_EQ( - grad->height(), out_dims[0], - platform::errors::InvalidArgument( - "The input tensor Grad's height of SgdOp " - "should be equal with ParamOut's dims. But received Grad's " - "height [%s] and ParamOut's dims [%s]", - grad->height(), out_dims[0])); - - auto &grad_value = grad->value(); - auto &grad_rows = grad->rows(); - const auto param_height = param_out->dims()[0]; - const auto param_width = param_out->numel() / param_height; - // note: it is not grad->height() - const auto grad_height = static_cast(grad_rows.size()); - const auto grad_width = grad_value.numel() / grad_height; - - PADDLE_ENFORCE_EQ( - grad_width, param_width, - platform::errors::InvalidArgument( - "The grad_value's numel of SgdOp " - "should be equal with param_out's numel. But received " - "grad_value's numel [%s] and param_out's numel [%s]", - grad_width, param_width)); - - sgd_dense_param_kernel< - T, framework::VarTypeTrait::kId>()(ctx); - } else { - PADDLE_ENFORCE_EQ( - false, true, platform::errors::PermissionDenied( - "Unsupported Variable Type of Grad in SgdOp. Excepted " - "LodTensor or SelectedRows, But received [%s]", - paddle::framework::ToTypeName(grad_var->Type()))); - } -} - } // namespace detail template @@ -247,61 +159,157 @@ class SGDOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const auto *learning_rate = ctx.Input("LearningRate"); - const auto *param_var = ctx.InputVar("Param"); - const auto *grad_var = ctx.InputVar("Grad"); if (param_var->IsType()) { - detail::sgd_op_invoke_dense_param_kernel(ctx); + invoke_dense_param_kernel(ctx); } else if (param_var->IsType()) { - PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + sparse_param_and_grad_kernel(ctx); + } else { + PADDLE_ENFORCE_EQ( + false, true, + platform::errors::PermissionDenied( + "Unsupported Variable Type of Parameter in SgdOp. Excepted " + "LodTensor or SelectedRows, But received [%s]", + paddle::framework::ToTypeName(param_var->Type()))); + } + } + + protected: + void invoke_dense_param_kernel(const framework::ExecutionContext &ctx) const { + const auto *param = ctx.Input("Param"); + auto *param_out = ctx.Output("ParamOut"); + const auto *grad_var = ctx.InputVar("Grad"); + + if (grad_var->IsType()) { + const auto *grad = ctx.Input("Grad"); + const auto sz = param_out->numel(); + PADDLE_ENFORCE_EQ(param->numel(), sz, platform::errors::InvalidArgument( - "When param is SelectedRows, gradient should also " - "be SelectedRows")); - const auto ¶m = param_var->Get(); - auto *param_out = ctx.Output("ParamOut"); - const auto &grad = grad_var->Get(); + "The input tensor Param's numel of SgdOp " + "should be equal with ParamOut's numel. " + "But received Param's " + "numel = [%s], ParamOut's numel = [%s]", + param->numel(), sz)); + PADDLE_ENFORCE_EQ(grad->numel(), sz, + platform::errors::InvalidArgument( + "The input tensor Grad's numel of SgdOp " + "should be equal with ParamOut's numel. 
" + "But received Grad's " + "numel = [%s], ParamOut's numel = [%s]", + grad->numel(), sz)); + + dense_param_and_grad_kernel(ctx); + } else if (grad_var->IsType()) { + // TODO(qijun): In Sparse SGD operator, in-place update is enforced. + // This manual optimization brings difficulty to track data dependency. + // It's better to find a more elegant solution. + PADDLE_ENFORCE_EQ(param, param_out, + platform::errors::InvalidArgument( + "The input tensor Param of SgdOp " + "should be equal with ParamOut if variable's " + "type is SelectedRows. ")); + const auto *grad = ctx.Input("Grad"); // for distributed training, a sparse var may be empty, // just skip updating. - if (grad.rows().size() == 0) { + if (grad->rows().size() == 0) { return; } - auto param_row_width = param.value().dims()[1]; - auto grad_row_width = grad.value().dims()[1]; + auto out_dims = param_out->dims(); PADDLE_ENFORCE_EQ( - param_row_width, grad_row_width, + grad->height(), out_dims[0], platform::errors::InvalidArgument( - "The param_row in SgdOP should have the same size with grad_row. " - "But received param_row's width is [%s], and grad_row's width is " - "[%s]", - param_row_width, grad_row_width)); - - const auto *lr = learning_rate->data(); - const auto *grad_data = grad.value().data(); - auto *out_data = param_out->mutable_value()->data(); - for (size_t i = 0; i < grad.rows().size(); i++) { - int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false); - PADDLE_ENFORCE_GE( - id_index, static_cast(0), - platform::errors::InvalidArgument( - "The id in SgdOp should be >= 0. But recevied id_index is [%s]", - id_index)); - for (int64_t j = 0; j < grad_row_width; j++) { - out_data[id_index * grad_row_width + j] -= - lr[0] * grad_data[i * grad_row_width + j]; - } - } + "The input tensor Grad's height of SgdOp " + "should be equal with ParamOut's dims. But received Grad's " + "height [%s] and ParamOut's dims [%s]", + grad->height(), out_dims[0])); + + auto &grad_value = grad->value(); + auto &grad_rows = grad->rows(); + const auto param_height = param_out->dims()[0]; + const auto param_width = param_out->numel() / param_height; + // note: it is not grad->height() + const auto grad_height = static_cast(grad_rows.size()); + const auto grad_width = grad_value.numel() / grad_height; + + PADDLE_ENFORCE_EQ( + grad_width, param_width, + platform::errors::InvalidArgument( + "The grad_value's numel of SgdOp " + "should be equal with param_out's numel. But received " + "grad_value's numel [%s] and param_out's numel [%s]", + grad_width, param_width)); + + dense_param_sparse_grad_kernel(ctx); } else { PADDLE_ENFORCE_EQ( false, true, platform::errors::PermissionDenied( - "Unsupported Variable Type of Parameter in SgdOp. Excepted " + "Unsupported Variable Type of Grad in SgdOp. 
Excepted " "LodTensor or SelectedRows, But received [%s]", - paddle::framework::ToTypeName(param_var->Type()))); + paddle::framework::ToTypeName(grad_var->Type()))); + } + } + + void sparse_param_and_grad_kernel( + const framework::ExecutionContext &ctx) const { + const auto *learning_rate = ctx.Input("LearningRate"); + const auto *param_var = ctx.InputVar("Param"); + const auto *grad_var = ctx.InputVar("Grad"); + + PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + platform::errors::InvalidArgument( + "When param is SelectedRows, gradient should also " + "be SelectedRows")); + const auto ¶m = param_var->Get(); + auto *param_out = ctx.Output("ParamOut"); + const auto &grad = grad_var->Get(); + + // for distributed training, a sparse var may be empty, + // just skip updating. + if (grad.rows().size() == 0) { + return; } + + auto param_row_width = param.value().dims()[1]; + auto grad_row_width = grad.value().dims()[1]; + PADDLE_ENFORCE_EQ( + param_row_width, grad_row_width, + platform::errors::InvalidArgument( + "The param_row in SgdOP should have the same size with grad_row. " + "But received param_row's width is [%s], and grad_row's width is " + "[%s]", + param_row_width, grad_row_width)); + + const auto *lr = learning_rate->data(); + const auto *grad_data = grad.value().data(); + auto *out_data = param_out->mutable_value()->data(); + for (size_t i = 0; i < grad.rows().size(); i++) { + int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false); + PADDLE_ENFORCE_GE( + id_index, static_cast(0), + platform::errors::InvalidArgument( + "The id in SgdOp should be >= 0. But recevied id_index is [%s]", + id_index)); + for (int64_t j = 0; j < grad_row_width; j++) { + out_data[id_index * grad_row_width + j] -= + lr[0] * grad_data[i * grad_row_width + j]; + } + } + } + + virtual void dense_param_and_grad_kernel( + const framework::ExecutionContext &ctx) const { + detail::sgd_dense_param_kernel< + T, framework::VarTypeTrait::kId>()(ctx); + } + + virtual void dense_param_sparse_grad_kernel( + const framework::ExecutionContext &ctx) const { + detail::sgd_dense_param_kernel< + T, framework::VarTypeTrait::kId>()(ctx); } }; diff --git a/paddle/fluid/operators/overlap_add_op.cc b/paddle/fluid/operators/overlap_add_op.cc new file mode 100644 index 0000000000000..627c613e297d8 --- /dev/null +++ b/paddle/fluid/operators/overlap_add_op.cc @@ -0,0 +1,188 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
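The sparse-gradient path above (both the generic sgd_dense_param_kernel and the oneDNN override) walks only the rows stored in the SelectedRows gradient and updates the matching rows of the dense parameter. A host-only sketch of that row-wise update, with illustrative names and data:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Dense parameter + sparse (selected-rows) gradient update:
// for each stored gradient row i, param[rows[i], :] -= lr * grad[i, :].
void sparse_sgd_update(const std::vector<int64_t>& rows,
                       const std::vector<float>& grad,  // rows.size() x width
                       int64_t width, float lr,
                       std::vector<float>* param) {     // height x width
  for (std::size_t i = 0; i < rows.size(); ++i) {
    const int64_t row = rows[i];
    for (int64_t j = 0; j < width; ++j) {
      (*param)[row * width + j] -= lr * grad[i * width + j];
    }
  }
}

int main() {
  // Parameter of shape 3 x 2, gradient touching rows 0 and 2 only.
  std::vector<float> param = {1, 1, 1, 1, 1, 1};
  std::vector<int64_t> rows = {0, 2};
  std::vector<float> grad = {10, 10, 20, 20};

  sparse_sgd_update(rows, grad, /*width=*/2, /*lr=*/0.1f, &param);

  for (float v : param) std::cout << v << " ";  // 0 0 1 1 -1 -1
  std::cout << "\n";
  return 0;
}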
+ +#include "paddle/fluid/operators/overlap_add_op.h" + +namespace paddle { +namespace operators { + +class OverlapAddOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "overlap_add"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "overlap_add"); + + const int hop_length = ctx->Attrs().Get("hop_length"); + const int axis = ctx->Attrs().Get("axis"); + + const auto x_dims = ctx->GetInputDim("X"); + const int x_rank = x_dims.size(); + + PADDLE_ENFORCE_GE( + x_rank, 2, + platform::errors::InvalidArgument( + "Input(X) of OverlapAddOp should be a tensor which contains " + "at least 2 dimensions, but got rank %s.", + x_rank)); + + PADDLE_ENFORCE_GT( + hop_length, 0, + platform::errors::InvalidArgument( + "Attribute(hop_length) of OverlapAddOp should be greater " + "than 0, but got %s.", + hop_length)); + + PADDLE_ENFORCE_EQ( + (axis == 0 || axis == -1), true, + platform::errors::InvalidArgument( + "Attribute(axis) of OverlapAddOp should 0 or -1, but got %s.", + axis)); + + std::vector output_shape; + int n_frames; + int frame_length; + + int start_axis; + int end_axis; + if (axis == 0) { + n_frames = x_dims[0]; + frame_length = x_dims[1]; + start_axis = 2; + end_axis = x_rank - 1; + } else { + n_frames = x_dims[x_rank - 1]; + frame_length = x_dims[x_rank - 2]; + start_axis = 0; + end_axis = x_rank - 3; + } + + PADDLE_ENFORCE_LE( + hop_length, frame_length, + platform::errors::InvalidArgument( + "Attribute(hop_length) of OverlapAddOp should be less or equal " + "than frame_length, but got hop_length(%s) > frame_length(%s).", + hop_length, frame_length)); + + const int seq_length = (n_frames - 1) * hop_length + frame_length; + + // It won't go into for loop when x_rank == 2U. + for (int i = start_axis; i <= end_axis; i++) { + output_shape.push_back(x_dims[i]); + } + + if (axis == 0) { + // (seq_length, ...) + output_shape.insert(output_shape.begin(), seq_length); + } else { + // (..., seq_length) + output_shape.push_back(seq_length); + } + + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(in_dtype, ctx.GetPlace()); + } +}; + +class OverlapAddOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of overlap_add op."); + AddOutput("Out", "(Tensor), The output tensor of overlap_add op."); + AddAttr("hop_length", + "Number of steps to advance between adjacent frames and " + "`0 < hop_length <= frame_length`."); + AddAttr("axis", + "Specify the axis to operate on the input Tensors. Its value " + "should be 0(the first dimension) or -1(the last dimension).") + .SetDefault(-1); + AddComment(R"DOC( + Reconstructs a tensor consisted of overlap added sequences from input frames. 
+ )DOC"); + } +}; + +class OverlapAddOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "overlap_add_grad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "overlap_add_grad"); + const auto x_dims = ctx->GetInputDim("X"); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(in_dtype, ctx.GetPlace()); + } +}; + +template +class OverlapAddOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr retv) const override { + retv->SetType("overlap_add_grad"); + retv->SetInput("X", this->Input("X")); + retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(overlap_add, ops::OverlapAddOp, ops::OverlapAddOpMaker, + ops::OverlapAddOpGradMaker, + ops::OverlapAddOpGradMaker); + +REGISTER_OPERATOR(overlap_add_grad, ops::OverlapAddOpGrad); + +REGISTER_OP_CPU_KERNEL( + overlap_add, ops::OverlapAddKernel, + ops::OverlapAddKernel, + ops::OverlapAddKernel, + ops::OverlapAddKernel, + ops::OverlapAddKernel>, + ops::OverlapAddKernel>); + +REGISTER_OP_CPU_KERNEL( + overlap_add_grad, + ops::OverlapAddGradKernel, + ops::OverlapAddGradKernel, + ops::OverlapAddGradKernel, + ops::OverlapAddGradKernel, + ops::OverlapAddGradKernel>, + ops::OverlapAddGradKernel>); diff --git a/paddle/fluid/operators/overlap_add_op.cu b/paddle/fluid/operators/overlap_add_op.cu new file mode 100644 index 0000000000000..2b7935e0191b7 --- /dev/null +++ b/paddle/fluid/operators/overlap_add_op.cu @@ -0,0 +1,43 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
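As OverlapAddOp::InferShape above computes, the reconstructed sequence length is (n_frames - 1) * hop_length + frame_length. A tiny self-contained check of that formula, with arbitrarily chosen values:

// Length of the sequence reconstructed by overlap-add from n_frames
// frames of length frame_length, advanced by hop_length each step.
constexpr int OverlapAddSeqLength(int n_frames, int frame_length,
                                  int hop_length) {
  return (n_frames - 1) * hop_length + frame_length;
}

int main() {
  // 4 frames of length 5, hop 2: frames start at 0, 2, 4, 6 and the last
  // frame ends at 6 + 5 = 11, so the output sequence has 11 samples.
  static_assert(OverlapAddSeqLength(4, 5, 2) == 11, "overlap-add length");
  // Degenerate case hop_length == frame_length: frames just concatenate.
  static_assert(OverlapAddSeqLength(3, 4, 4) == 12, "no overlap");
  return 0;
}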
+ +#include "paddle/fluid/operators/overlap_add_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + overlap_add, + ops::OverlapAddKernel, + ops::OverlapAddKernel, + ops::OverlapAddKernel, + ops::OverlapAddKernel, + ops::OverlapAddKernel, + ops::OverlapAddKernel>, + ops::OverlapAddKernel>); + +REGISTER_OP_CUDA_KERNEL( + overlap_add_grad, + ops::OverlapAddGradKernel, + ops::OverlapAddGradKernel, + ops::OverlapAddGradKernel, + ops::OverlapAddGradKernel, + ops::OverlapAddGradKernel, + ops::OverlapAddGradKernel>, + ops::OverlapAddGradKernel>); diff --git a/paddle/fluid/operators/overlap_add_op.h b/paddle/fluid/operators/overlap_add_op.h new file mode 100644 index 0000000000000..865659ee942e4 --- /dev/null +++ b/paddle/fluid/operators/overlap_add_op.h @@ -0,0 +1,304 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/seq2col.h" +#include "paddle/fluid/operators/transpose_op.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +template +struct OverlapAddFunctor { + void operator()(const DeviceContext& dev_ctx, const Tensor* input, + Tensor* output, size_t seq_length, size_t frame_length, + size_t n_frames, size_t hop_length, + bool is_grad = false) const { + auto numel = output->numel(); + const auto* input_data = input->data(); + auto* output_data = output->data(); + + platform::ForRange for_range(dev_ctx, numel); + if (!is_grad) { + math::Col2SeqFunctor functor(input_data, output_data, seq_length, + frame_length, n_frames, hop_length); + for_range(functor); + } else { + math::Seq2ColFunctor functor(input_data, output_data, seq_length, + frame_length, n_frames, hop_length); + for_range(functor); + } + } +}; + +template +class OverlapAddKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + const Tensor* x = ctx.Input("X"); + Tensor* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + const size_t x_rank = x->dims().size(); + const size_t out_rank = out->dims().size(); + + const int hop_length = ctx.Attr("hop_length"); + const int axis = ctx.Attr("axis"); + const int n_frames = (axis == 0) ? x->dims()[0] : x->dims()[x_rank - 1]; + const int frame_length = (axis == 0) ? x->dims()[1] : x->dims()[x_rank - 2]; + const int seq_length = + (axis == 0) ? 
out->dims()[0] : out->dims()[out_rank - 1]; + + auto& dev_ctx = ctx.device_context(); + + Tensor x_(x->type()); + x_ = *x; + + framework::DDim preserved_dims; + if (out_rank > 2) { + // Save dims used to flatten both input and output tensors and restore + // output tensor. + framework::DDim x_resized_dims; + framework::DDim out_resized_dims; + if (axis == 0) { + preserved_dims = framework::slice_ddim(out->dims(), 1, out_rank); + x_resized_dims = {n_frames, frame_length, + framework::product(preserved_dims)}; + out_resized_dims = {seq_length, framework::product(preserved_dims)}; + } else { + preserved_dims = framework::slice_ddim(out->dims(), 0, out_rank - 1); + x_resized_dims = {framework::product(preserved_dims), frame_length, + n_frames}; + out_resized_dims = {framework::product(preserved_dims), seq_length}; + } + x_.Resize(x_resized_dims); + out->Resize(out_resized_dims); + } + + Tensor trans_x(x_.type()); + Tensor trans_out(out->type()); + + // Transpose input and output in case that axis is 0. + if (axis == 0) { + if (out_rank == 1U) { + trans_out = *out; + + std::vector perm_x{1, 0}; + auto x_dims_vec = framework::vectorize(x_.dims()); + for (int i = 0; i < x_.dims().size(); ++i) { + x_dims_vec[i] = x_.dims()[perm_x[i]]; + } + trans_x.Resize(framework::make_ddim(x_dims_vec)); + trans_x.mutable_data(ctx.GetPlace()); + TransCompute(perm_x.size(), dev_ctx, x_, &trans_x, + perm_x); + } else { + std::vector perm_out{1, 0}; + auto out_dims_vec = framework::vectorize(out->dims()); + for (int i = 0; i < out->dims().size(); ++i) { + out_dims_vec[i] = out->dims()[perm_out[i]]; + } + trans_out.Resize(framework::make_ddim(out_dims_vec)); + trans_out.mutable_data(ctx.GetPlace()); + TransCompute(perm_out.size(), dev_ctx, *out, + &trans_out, perm_out); + + std::vector perm_x{2, 1, 0}; + auto x_dims_vec = framework::vectorize(x_.dims()); + for (int i = 0; i < x_.dims().size(); ++i) { + x_dims_vec[i] = x_.dims()[perm_x[i]]; + } + trans_x.Resize(framework::make_ddim(x_dims_vec)); + trans_x.mutable_data(ctx.GetPlace()); + TransCompute(perm_x.size(), dev_ctx, x_, &trans_x, + perm_x); + } + } else { + trans_x = x_; + trans_out = *out; + } + + OverlapAddFunctor()(dev_ctx, &trans_x, &trans_out, + seq_length, frame_length, n_frames, + hop_length, /*is_grad*/ false); + + // Transpose output in case axis is 0. + if (axis == 0 && out_rank > 1U) { + std::vector perm_out{1, 0}; + TransCompute(perm_out.size(), dev_ctx, trans_out, out, + perm_out); + } + + // Restore output dims when the number of dims is larger than 2. + if (out_rank > 2) { + std::vector restored_out_shape; + for (int i = 0; i < preserved_dims.size(); i++) { + restored_out_shape.push_back(preserved_dims[i]); + } + + if (axis == 0) { + // (seq_length, ...) + restored_out_shape.insert(restored_out_shape.begin(), seq_length); + } else { + // (..., seq_length) + restored_out_shape.push_back(seq_length); + } + + out->Resize(framework::make_ddim(restored_out_shape)); + } + } +}; + +template +class OverlapAddGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* d_out = ctx.Input(framework::GradVarName("Out")); + Tensor* d_x = ctx.Output(framework::GradVarName("X")); + d_x->mutable_data(ctx.GetPlace()); + const size_t d_out_rank = d_out->dims().size(); + const size_t d_x_rank = d_x->dims().size(); + + const int hop_length = ctx.Attr("hop_length"); + const int axis = ctx.Attr("axis"); + const int n_frames = + (axis == 0) ? 
d_x->dims()[0] : d_x->dims()[d_x_rank - 1]; + const int frame_length = + (axis == 0) ? d_x->dims()[1] : d_x->dims()[d_x_rank - 2]; + const int seq_length = + (axis == 0) ? d_out->dims()[0] : d_out->dims()[d_out_rank - 1]; + + auto& dev_ctx = ctx.device_context(); + + // When the number of input dims is larger than 2, it needs to copy + // from x to resize input into 2d and output into 3d. Morevoer, output + // dims will be restored at the last step. + Tensor d_out_(d_out->type()); + d_out_ = *d_out; + + framework::DDim preserved_dims; + if (d_out_rank > 2) { + // Save dims used to flatten both input and output tensors and restore + // output tensor. + framework::DDim d_x_resized_dims; + framework::DDim d_out_resized_dims; + if (axis == 0) { + preserved_dims = framework::slice_ddim(d_out_.dims(), 1, d_out_rank); + d_x_resized_dims = {n_frames, frame_length, + framework::product(preserved_dims)}; + d_out_resized_dims = {seq_length, framework::product(preserved_dims)}; + } else { + preserved_dims = + framework::slice_ddim(d_out_.dims(), 0, d_out_rank - 1); + d_x_resized_dims = {framework::product(preserved_dims), frame_length, + n_frames}; + d_out_resized_dims = {framework::product(preserved_dims), seq_length}; + } + d_x->Resize(d_x_resized_dims); + d_out_.Resize(d_out_resized_dims); + } + + Tensor trans_d_x(d_x->type()); + Tensor trans_d_out(d_out_.type()); + + // Transpose input and output in case that axis is 0. + if (axis == 0) { + if (d_out_rank == 1U) { + trans_d_out = d_out_; + + std::vector perm_d_x{1, 0}; + auto d_x_dims_vec = framework::vectorize(d_x->dims()); + for (int i = 0; i < d_x->dims().size(); ++i) { + d_x_dims_vec[i] = d_x->dims()[perm_d_x[i]]; + } + trans_d_x.Resize(framework::make_ddim(d_x_dims_vec)); + trans_d_x.mutable_data(ctx.GetPlace()); + TransCompute(perm_d_x.size(), dev_ctx, *d_x, + &trans_d_x, perm_d_x); + } else { + std::vector perm_d_out{1, 0}; + auto d_out_dims_vec = framework::vectorize(d_out_.dims()); + for (int i = 0; i < d_out_.dims().size(); ++i) { + d_out_dims_vec[i] = d_out_.dims()[perm_d_out[i]]; + } + trans_d_out.Resize(framework::make_ddim(d_out_dims_vec)); + trans_d_out.mutable_data(ctx.GetPlace()); + TransCompute(perm_d_out.size(), dev_ctx, d_out_, + &trans_d_out, perm_d_out); + + std::vector perm_d_x{2, 1, 0}; + auto d_x_dims_vec = framework::vectorize(d_x->dims()); + for (int i = 0; i < d_x->dims().size(); ++i) { + d_x_dims_vec[i] = d_x->dims()[perm_d_x[i]]; + } + trans_d_x.Resize(framework::make_ddim(d_x_dims_vec)); + trans_d_x.mutable_data(ctx.GetPlace()); + TransCompute(perm_d_x.size(), dev_ctx, *d_x, + &trans_d_x, perm_d_x); + } + } else { + trans_d_x = *d_x; + trans_d_out = d_out_; + } + + OverlapAddFunctor()(dev_ctx, &trans_d_out, &trans_d_x, + seq_length, frame_length, n_frames, + hop_length, + /*is_grad*/ true); + + // Transpose output in case axis is 0. + if (axis == 0) { + if (d_out_rank == 1U) { + std::vector perm_d_x{1, 0}; + TransCompute(perm_d_x.size(), dev_ctx, trans_d_x, d_x, + perm_d_x); + } else { + std::vector perm_d_x{2, 1, 0}; + TransCompute(perm_d_x.size(), dev_ctx, trans_d_x, d_x, + perm_d_x); + } + } + + // Restore output dims when the number of dims is larger than 2. + if (d_out_rank > 2) { + std::vector restored_d_x_shape; + for (int i = 0; i < preserved_dims.size(); i++) { + restored_d_x_shape.push_back(preserved_dims[i]); + } + + if (axis == 0) { + // (n_frames, frame_length, ...) 
+ restored_d_x_shape.insert(restored_d_x_shape.begin(), frame_length); + restored_d_x_shape.insert(restored_d_x_shape.begin(), n_frames); + } else { + // (..., frame_length, n_frames) + restored_d_x_shape.push_back(frame_length); + restored_d_x_shape.push_back(n_frames); + } + + d_x->Resize(framework::make_ddim(restored_d_x_shape)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index bd6694abdbf76..cfe778c49121f 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -20,7 +20,9 @@ limitations under the License. */ #include namespace cub = hipcub; #endif +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/p_norm_op.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { @@ -30,12 +32,23 @@ __device__ __forceinline__ int sgn(T val) { return (T(0) < val) - (val < T(0)); } +__device__ __forceinline__ platform::float16 inline_abs(platform::float16 x) { + return static_cast(abs(static_cast(x))); +} __device__ __forceinline__ float inline_abs(float x) { return abs(x); } __device__ __forceinline__ double inline_abs(double x) { return abs(x); } +__device__ __forceinline__ int inline_sign(platform::float16 x) { + return sgn(x); +} __device__ __forceinline__ int inline_sign(float x) { return sgn(x); } __device__ __forceinline__ int inline_sign(double x) { return sgn(x); } +__device__ __forceinline__ platform::float16 inline_pow( + platform::float16 base, platform::float16 exponent) { + return static_cast( + pow(static_cast(base), static_cast(exponent))); +} __device__ __forceinline__ float inline_pow(float base, float exponent) { return pow(base, exponent); } @@ -47,21 +60,23 @@ template __global__ void Pnorm(const T* x, const int pre, const int axis_n, // dim in axis const int post, float porder, T* out_norm) { - typedef cub::BlockReduce BlockReduce; + using MT = typename details::MPTypeTrait::Type; + typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; int num = pre * post; - auto porder_t = static_cast(porder); - auto porder_inv = static_cast(1.0 / porder); + auto porder_t = static_cast(porder); + auto porder_inv = static_cast(1.0 / porder); for (int i = blockIdx.x; i < num; i += gridDim.x) { int base = (i / post) * post * axis_n + (i % post); - T sum = 0.0; + MT sum = static_cast(0.0); for (int j = threadIdx.x; j < axis_n; j += blockDim.x) { - const T x_ij = x[base + j * post]; + const MT x_ij = static_cast(x[base + j * post]); sum += inline_pow(inline_abs(x_ij), porder_t); } - T reduce_result = BlockReduce(temp_storage).Sum(sum); - if (threadIdx.x == 0) out_norm[i] = inline_pow(reduce_result, porder_inv); + MT reduce_result = BlockReduce(temp_storage).Sum(sum); + if (threadIdx.x == 0) + out_norm[i] = static_cast(inline_pow(reduce_result, porder_inv)); } } @@ -69,18 +84,19 @@ template __global__ void ZeorNorm(const T* x, const int pre, const int axis_n, // dim in axis const int post, T* out_norm) { - typedef cub::BlockReduce BlockReduce; + using MT = typename details::MPTypeTrait::Type; + typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; int num = pre * post; for (int i = blockIdx.x; i < num; i += gridDim.x) { int base = (i / post) * post * axis_n + (i % post); - T sum = 0.0; + MT sum = static_cast(0.0); for (int j = threadIdx.x; j < axis_n; j += blockDim.x) { - const T x_ij = x[base + j * post]; 
- sum += static_cast(x_ij != 0); + const MT x_ij = static_cast(x[base + j * post]); + sum += static_cast(static_cast(x_ij) != 0); } - T reduce_result = BlockReduce(temp_storage).Sum(sum); - if (threadIdx.x == 0) out_norm[i] = reduce_result; + MT reduce_result = BlockReduce(temp_storage).Sum(sum); + if (threadIdx.x == 0) out_norm[i] = static_cast(reduce_result); } } @@ -172,27 +188,29 @@ __global__ void PnormGradient(const T* x, const T* x_norm, const T* y_grad, const float porder, const int pre, const int axis_n, const int post, const T eps, T* x_grad) { + using MT = typename details::MPTypeTrait::Type; // dx = (x/pnorm_broadcast).pow(p-1) * norm_dy.broadcast * sign(x) int num = pre * post; - auto porder_grad = static_cast(porder - 1.0f); + auto porder_grad = static_cast(porder - 1.0f); for (int i = blockIdx.x; i < num; i += gridDim.x) { - __shared__ T pnorm_i; - __shared__ T yout_i; + __shared__ MT pnorm_i; + __shared__ MT yout_i; auto base = (i / post) * post * axis_n + (i % post); if (threadIdx.x == 0) { - pnorm_i = x_norm[i]; - yout_i = y_grad[i]; + pnorm_i = static_cast(x_norm[i]); + yout_i = static_cast(y_grad[i]); } __syncthreads(); for (int j = threadIdx.x; j < axis_n; j += blockDim.x) { int index = base + j * post; - const T x_ij = inline_abs(x[index]); - x_grad[index] = inline_pow(x_ij, porder_grad) / - (inline_pow(pnorm_i, porder_grad) + eps) * yout_i * - inline_sign(x[index]); + const MT x_ij = static_cast(inline_abs(x[index])); + x_grad[index] = static_cast( + inline_pow(x_ij, porder_grad) / + (inline_pow(pnorm_i, porder_grad) + static_cast(eps)) * yout_i * + static_cast(inline_sign(x[index]))); } } } @@ -216,7 +234,7 @@ __global__ void InfNormGradient(const T* x, const T* x_norm, const T* y_grad, int index = base + j * post; const T x_ij = inline_abs(x[index]); if (x_ij == pnorm_i) { - x_grad[index] = inline_sign(x[index]) * yout_i; + x_grad[index] = static_cast(inline_sign(x[index])) * yout_i; } else { x_grad[index] = static_cast(0); } @@ -278,7 +296,11 @@ class PnormGradCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(p_norm, ops::PnormCUDAKernel, +REGISTER_OP_CUDA_KERNEL(p_norm, + ops::PnormCUDAKernel, + ops::PnormCUDAKernel, ops::PnormCUDAKernel); -REGISTER_OP_CUDA_KERNEL(p_norm_grad, ops::PnormGradCUDAKernel, - ops::PnormGradCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + p_norm_grad, ops::PnormGradCUDAKernel, + ops::PnormGradCUDAKernel, + ops::PnormGradCUDAKernel); diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index 3bf66c77badb9..1ace706bac6ee 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
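The p_norm changes above follow the usual mixed-precision pattern: data stays in float16, but the block reduction accumulates |x|^p in float (via MPTypeTrait) before the final 1/p power, to limit overflow and rounding error in the sum. A host-only sketch of the idea; it substitutes a float -> double promotion for the float16 -> float one so it runs without device code, and the AccType trait here is invented for the example:

#include <cmath>
#include <iostream>
#include <vector>

// Map a storage type to a wider accumulation type, mirroring what
// MPTypeTrait does for float16 -> float in the kernels above.
template <typename T> struct AccType { using type = T; };
template <> struct AccType<float> { using type = double; };  // illustrative

// Accumulate |x|^p in the wider type, take the 1/p root, cast back down.
template <typename T>
T p_norm(const std::vector<T>& x, double porder) {
  using AccT = typename AccType<T>::type;
  AccT sum = 0;
  for (T v : x) {
    sum += std::pow(std::abs(static_cast<AccT>(v)), static_cast<AccT>(porder));
  }
  return static_cast<T>(std::pow(sum, AccT(1) / static_cast<AccT>(porder)));
}

int main() {
  std::vector<float> x = {3.0f, 4.0f};
  std::cout << p_norm(x, 2.0) << "\n";  // 5 (the L2 norm of {3, 4})
  return 0;
}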
*/ #include "paddle/fluid/operators/pad_op.h" #include +#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -170,10 +171,18 @@ REGISTER_OP_CPU_KERNEL( pad, ops::PadKernel, ops::PadKernel, ops::PadKernel, - ops::PadKernel); + ops::PadKernel, + ops::PadKernel>, + ops::PadKernel>); REGISTER_OP_CPU_KERNEL( pad_grad, ops::PadGradKernel, - ops::PadGradKernel); + ops::PadGradKernel, + ops::PadGradKernel>, + ops::PadGradKernel>); REGISTER_OP_CUDA_KERNEL( pad, ops::PadKernel, @@ -181,9 +190,17 @@ REGISTER_OP_CUDA_KERNEL( ops::PadKernel, ops::PadKernel, ops::PadKernel); + paddle::platform::float16>, + ops::PadKernel>, + ops::PadKernel>); REGISTER_OP_CUDA_KERNEL( pad_grad, ops::PadGradKernel, ops::PadGradKernel, ops::PadGradKernel); + paddle::platform::float16>, + ops::PadGradKernel>, + ops::PadGradKernel>); diff --git a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc index 8e249e72db514..cd1bdc4d60c74 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc +++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc @@ -15,7 +15,8 @@ limitations under the License. */ #include "paddle/fluid/operators/pscore/heter_listen_and_serv_op.h" #include "paddle/fluid/framework/op_registry.h" -DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send"); +PADDLE_DEFINE_EXPORTED_int32(rpc_send_thread_num, 12, + "number of threads for rpc send"); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc index d3faa2c8460f2..da637dfeb237d 100644 --- a/paddle/fluid/operators/psroi_pool_op.cc +++ b/paddle/fluid/operators/psroi_pool_op.cc @@ -25,22 +25,26 @@ class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "Tensor, " + "(Tensor), " "the input of PSROIPoolOp. " "The format of input tensor is NCHW. Where N is the batch size, " "C is the number of input channels, " "H is the height of the input feature map, and " "W is the width. The data type can be float32 or float64"); AddInput("ROIs", - "LoDTensor, " + "(LoDTensor), " "ROIs (Regions of Interest) to pool over. " "should be a 2-D LoDTensor of shape (num_rois, 4) " "given as [(x1, y1, x2, y2), ...]. " "where (x1, y1) is the top left coordinates, and " "(x2, y2) is the bottom right coordinates. " "The roi batch index can be calculated from LoD."); + AddInput("RoisNum", + "(Tensor), " + "The number of RoIs in each image.") + .AsDispensable(); AddOutput("Out", - "Tensor, " + "(Tensor), " "the output of PSROIPoolOp is a 4-D Tensor with shape " "(num_rois, output_channels, pooled_h, pooled_w). " "The data type is the same as `x` "); @@ -65,8 +69,6 @@ class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "the pooled output width.") .SetDefault(1); AddComment(R"Doc( -**PSROIPool Operator,** `rois` **of this op should be a LoDTensor** - Position sensitive region of interest pooling (also known as PSROIPooling) is to perform position-sensitive average pooling on regions of interest specified by input, takes as input N position-sensitive score maps and a list of num_rois regions of interest. 
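With the new dispensable RoisNum input above, the per-ROI batch index is reconstructed from a per-image ROI count instead of being read from the LoD. The mapping is a simple prefix walk; a host-side sketch with made-up numbers:

#include <iostream>
#include <vector>

// Expand per-image ROI counts into a per-ROI batch-id table, as the
// RoisNum code path above does before handing the table to the kernel.
std::vector<int> RoisBatchIds(const std::vector<int>& rois_num) {
  std::vector<int> batch_ids;
  for (int image = 0; image < static_cast<int>(rois_num.size()); ++image) {
    batch_ids.insert(batch_ids.end(), rois_num[image], image);
  }
  return batch_ids;
}

int main() {
  // Two images: the first contributes 2 ROIs, the second 3.
  std::vector<int> rois_num = {2, 3};
  for (int id : RoisBatchIds(rois_num)) std::cout << id << " ";  // 0 0 1 1 1
  std::cout << "\n";
  return 0;
}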
@@ -106,7 +108,14 @@ class PSROIPoolOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " "given as [(x1, y1, x2, y2), ...]")); - + if (ctx->HasInput("RoisNum")) { + auto rois_num_dims = ctx->GetInputDim("RoisNum"); + PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1, + platform::errors::InvalidArgument( + "The second dimension of RoisNum should " + "be 1, but received dimension is %d", + rois_num_dims.size())); + } int pooled_height = ctx->Attrs().Get("pooled_height"); int pooled_width = ctx->Attrs().Get("pooled_width"); int output_channels = ctx->Attrs().Get("output_channels"); @@ -184,6 +193,7 @@ class PSROIPoolGradMaker : public framework::SingleGradOpMaker { op->SetType("psroi_pool_grad"); op->SetInput("X", this->Input("X")); op->SetInput("ROIs", this->Input("ROIs")); + op->SetInput("RoisNum", this->Input("RoisNum")); op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); op->SetAttrMap(this->Attrs()); diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu index 748b6036008f1..f69edfc1fcfec 100644 --- a/paddle/fluid/operators/psroi_pool_op.cu +++ b/paddle/fluid/operators/psroi_pool_op.cu @@ -185,34 +185,67 @@ class GPUPSROIPoolOpKernel : public framework::OpKernel { int rois_num = rois->dims()[0]; if (rois_num == 0) return; - - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ(rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be " - "the same but received batch size of input(ROIs) and " - "input(X) is %d and %d respectively.", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The number of rois from input(ROIs) and its LOD " - "must be the same. 
Received rois %d of input(ROIs) " - "but the number of rois %d from its LOD is %d", - rois_num, rois_num_with_lod)); - - // set rois batch id + int rois_batch_size; framework::Tensor rois_batch_id_list; rois_batch_id_list.Resize({rois_num}); int* rois_batch_id_data = rois_batch_id_list.mutable_data(platform::CPUPlace()); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; + + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); + auto* rois_num_data = rois_num_t->data(); + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + platform::errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be " + "the same but received batch size of input(ROIs) and " + "input(X) is %d and %d respectively.", + rois_batch_size, batch_size)); + std::vector rois_num_list(rois_batch_size); + memory::Copy(platform::CPUPlace(), rois_num_list.data(), + BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()), + rois_num_data, sizeof(int) * rois_batch_size, 0); + int rois_num_count = 0; + for (int i = 0; i < rois_batch_size; ++i) { + rois_num_count += rois_num_list[i]; + } + PADDLE_ENFORCE_EQ( + rois_num_count, rois_num, + platform::errors::InvalidArgument( + "the rois_num from input and RoisNum must be the same")); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_list[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_list[n]; + } + } else { + auto rois_lod = rois->lod().back(); + rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + platform::errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be " + "the same but received batch size of input(ROIs) and " + "input(X) is %d and %d respectively.", + rois_batch_size, batch_size)); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, + platform::errors::InvalidArgument( + "The number of rois from input(ROIs) and its LOD " + "must be the same. 
Received rois %d of input(ROIs) " + "but the number of rois %d from its LOD is %d", + rois_num, rois_num_with_lod)); + + // set rois batch id + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } } } - framework::Tensor rois_batch_id_list_gpu; framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), ctx.device_context(), &rois_batch_id_list_gpu); @@ -257,14 +290,30 @@ class GPUPSROIPoolGradOpKernel : public framework::OpKernel { rois_batch_id_list.Resize({rois_num}); int* rois_batch_id_data = rois_batch_id_list.mutable_data(platform::CPUPlace()); - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; + int rois_batch_size; + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); + std::vector rois_num_list(rois_batch_size); + memory::Copy(platform::CPUPlace(), rois_num_list.data(), + BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()), + rois_num_t->data(), sizeof(int) * rois_batch_size, 0); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_list[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_list[n]; + } + } else { + auto rois_lod = rois->lod().back(); + rois_batch_size = rois_lod.size() - 1; + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } } } - framework::Tensor rois_batch_id_list_gpu; framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), ctx.device_context(), &rois_batch_id_list_gpu); diff --git a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h index 4f4cb24844b8c..4d7e9ce295fc8 100644 --- a/paddle/fluid/operators/psroi_pool_op.h +++ b/paddle/fluid/operators/psroi_pool_op.h @@ -40,6 +40,13 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel { int width = in_dims[3]; int rois_num = rois->dims()[0]; + PADDLE_ENFORCE_EQ(input_channels, + output_channels * pooled_height * pooled_width, + platform::errors::InvalidArgument( + "the channels of input " + "X should equal the product of " + "output_channels x pooled_height x pooled_width")); + auto in_stride = framework::stride(in_dims); auto out_stride = framework::stride(out->dims()); @@ -49,32 +56,52 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel { rois_batch_id_list.Resize({rois_num}); int* rois_batch_id_data = rois_batch_id_list.mutable_data(ctx.GetPlace()); - - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("the rois_batch_size and input(X) " - "batch_size should be the same.")); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num_with_lod, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and lod must be the same")); - - PADDLE_ENFORCE_EQ(input_channels, - output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "the channels of input " - "X should equal the product of " - "output_channels x pooled_height x pooled_width")); - - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; + int 
rois_batch_size; + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); + auto* rois_num_data = rois_num_t->data(); + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + platform::errors::InvalidArgument( + "The batch size of rois and the batch size of images " + " must be the same. But received the batch size of rois is %d, " + "and the batch size of images is %d", + rois_batch_size, batch_size)); + int rois_num_count = 0; + for (int i = 0; i < rois_batch_size; ++i) { + rois_num_count += rois_num_data[i]; + } + PADDLE_ENFORCE_EQ( + rois_num_count, rois_num, + platform::errors::InvalidArgument( + "the rois_num from input and RoisNum must be the same")); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_data[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_data[n]; + } + } else { + auto rois_lod = rois->lod().back(); + rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + platform::errors::InvalidArgument("the rois_batch_size and input(X) " + "batch_size should be the same.")); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ( + rois_num_with_lod, rois_num, + platform::errors::InvalidArgument( + "the rois_num from input and lod must be the same")); + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } } } - T* output_data = out->mutable_data(ctx.GetPlace()); const T* input_rois = rois->data(); @@ -93,7 +120,6 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel { static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; T roi_end_h = static_cast(round(offset_input_rois[3]) + 1.) 
* spatial_scale; - // Force too small rois to be 1 x 1 T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); @@ -172,15 +198,28 @@ class CPUPSROIPoolGradOpKernel : public framework::OpKernel { rois_batch_id_list.Resize({rois_num}); int* rois_batch_id_data = rois_batch_id_list.mutable_data(ctx.GetPlace()); - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; + int rois_batch_size; + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); + auto* rois_num_data = rois_num_t->data(); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_data[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_data[n]; + } + } else { + auto rois_lod = rois->lod().back(); + rois_batch_size = rois_lod.size() - 1; + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } } } - const T* input_rois = rois->data(); const T* output_grad_data = output_grad->data(); T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/randperm_op_npu.cc b/paddle/fluid/operators/randperm_op_npu.cc new file mode 100644 index 0000000000000..a16c0d905a555 --- /dev/null +++ b/paddle/fluid/operators/randperm_op_npu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/randperm_op.h" +#include "paddle/fluid/framework/op_registry.h" + +template +using kernel = + paddle::operators::RandpermKernel; + +REGISTER_OP_NPU_KERNEL(randperm, kernel, kernel, kernel, + kernel); diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 92e5e4a0cd120..556f1bccd1680 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -215,13 +215,7 @@ void RecurrentOp::RunImpl(const framework::Scope &scope, kSkipEagerDeletionVars), /*skip_ref_cnt_vars*/ true); - static std::mutex mutex; - std::lock_guard lock(mutex); StepScopes scopes = CreateStepScopes(dev_ctx, scope, seq_len); - // TODO(gfwm2013) Function CreateStepScopes would make segmentation fault in - // multithreading in eval process, so we use a mutex before function - // CreateStepScopes to make sure that the computing process is correct. This - // problem will fix in next pull request. for (size_t i = 0; i < seq_len; ++i) { size_t seq_offset = reverse ? 
seq_len - i - 1 : i; VLOG(3) << "Recurrent operate at the time step " << seq_offset; @@ -292,6 +286,11 @@ void RecurrentOp::RunImpl(const framework::Scope &scope, StepScopes RecurrentOp::CreateStepScopes(const platform::DeviceContext &dev_ctx, const framework::Scope &scope, size_t seq_len) const { + static std::mutex mutex; + std::lock_guard lock(mutex); + // TODO(baoachun) Function CreateStepScopes may lead to segmentation + // fault in multithreading in eval process. The performance drop of + // adding mutex need to be fixed. auto *var = scope.FindVar(Output(kStepScopes)); PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument( "RecurrentOp gets empty StepScopes var")); diff --git a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h index 0165cfd8b8089..17801454da2fa 100644 --- a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h +++ b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h @@ -71,10 +71,8 @@ class ReduceMKLDNNKernel : public framework::OpKernel { if (input_dims == output_dims) { mkldnn::memory::data_type input_type = framework::ToMKLDNNDataType(input->type()); - std::string key = platform::CreateKey( - dev_ctx, input_dims, input->format(), input->format(), input_type); - platform::ReorderMKLDNNHandler reorder_handler( - input_dims, input->type(), input_type, dev_ctx, onednn_engine, key); + platform::ReorderMKLDNNHandler reorder_handler(input_dims, input->type(), + input_type, onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( input->format(), platform::to_void_cast(input->data())); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 8913642a5941f..c74f0f0e499b4 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -229,7 +229,7 @@ class ReshapeOp : public framework::OperatorWithKernel { // by now we require that if the input tensor is zero shape, the target // shape of output must be zero if (in_size == 0) { - PADDLE_ENFORCE_EQ( + PADDLE_ENFORCE_LE( capacity, in_size, platform::errors::InvalidArgument( "The 'shape' in ReshapeOp is invalid. 
" @@ -249,11 +249,11 @@ class ReshapeOp : public framework::OperatorWithKernel { framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); #ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } +// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { +// return framework::OpKernelType(input_data_type, ctx.GetPlace(), +// framework::DataLayout::kMKLDNN, +// framework::LibraryType::kMKLDNN); +// } #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -367,11 +367,11 @@ class ReshapeGradOp : public framework::OperatorWithKernel { framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); #ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } +// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { +// return framework::OpKernelType(input_data_type, ctx.GetPlace(), +// framework::DataLayout::kMKLDNN, +// framework::LibraryType::kMKLDNN); +// } #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -558,11 +558,11 @@ class Reshape2GradOp : public framework::OperatorWithKernel { ctx, framework::GradVarName("Out")); #ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } +// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { +// return framework::OpKernelType(input_data_type, ctx.GetPlace(), +// framework::DataLayout::kMKLDNN, +// framework::LibraryType::kMKLDNN); +// } #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/roll_op.cc b/paddle/fluid/operators/roll_op.cc index a0c28ae6cba16..b6a8111592fb7 100644 --- a/paddle/fluid/operators/roll_op.cc +++ b/paddle/fluid/operators/roll_op.cc @@ -18,6 +18,7 @@ #include #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -148,12 +149,20 @@ REGISTER_OP_CPU_KERNEL( roll, ops::RollKernel, ops::RollKernel, ops::RollKernel, - ops::RollKernel); + ops::RollKernel, + ops::RollKernel>, + ops::RollKernel>); REGISTER_OP_CPU_KERNEL( roll_grad, ops::RollGradKernel, ops::RollGradKernel, ops::RollGradKernel, - ops::RollGradKernel); + ops::RollGradKernel, + ops::RollGradKernel>, + ops::RollGradKernel>); REGISTER_OP_VERSION(roll) .AddCheckpoint( diff --git a/paddle/fluid/operators/roll_op.cu b/paddle/fluid/operators/roll_op.cu index 136c5c0aca8b3..a170ce2fb111d 100644 --- a/paddle/fluid/operators/roll_op.cu +++ b/paddle/fluid/operators/roll_op.cu @@ -16,6 +16,7 @@ #include "paddle/fluid/framework/array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/roll_op.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { @@ -188,9 +189,17 @@ REGISTER_OP_CUDA_KERNEL( roll, ops::RollKernel, ops::RollKernel, ops::RollKernel, - ops::RollKernel); + ops::RollKernel, + ops::RollKernel>, + ops::RollKernel>); REGISTER_OP_CUDA_KERNEL( roll_grad, ops::RollGradKernel, ops::RollGradKernel, ops::RollGradKernel, - ops::RollGradKernel); + ops::RollGradKernel, + ops::RollGradKernel>, + ops::RollGradKernel>); diff --git 
a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index d8ec12659f77f..dd135b89714da 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/shape_op.h" #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -64,6 +65,7 @@ Return the shape of the input. } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OPERATOR( shape, ops::ShapeOp, ops::ShapeOpMaker, paddle::framework::EmptyGradOpMaker, @@ -71,4 +73,6 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, ops::ShapeKernel, ops::ShapeKernel, ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel); + ops::ShapeKernel, + ops::ShapeKernel>, + ops::ShapeKernel>); diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu index fce723c78413a..c6e380a94f84d 100644 --- a/paddle/fluid/operators/shape_op.cu +++ b/paddle/fluid/operators/shape_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/shape_op.h" +#include "paddle/fluid/platform/complex.h" REGISTER_OP_CUDA_KERNEL( shape, paddle::operators::ShapeKernel, @@ -21,4 +22,6 @@ REGISTER_OP_CUDA_KERNEL( paddle::operators::ShapeKernel, paddle::operators::ShapeKernel, paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel); + paddle::operators::ShapeKernel, + paddle::operators::ShapeKernel>, + paddle::operators::ShapeKernel>); diff --git a/paddle/fluid/operators/solve_op.cc b/paddle/fluid/operators/solve_op.cc new file mode 100644 index 0000000000000..6e89eec7493da --- /dev/null +++ b/paddle/fluid/operators/solve_op.cc @@ -0,0 +1,216 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/solve_op.h" +#include +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" + +namespace paddle { +namespace operators { + +using framework::OpKernelType; +using framework::Tensor; + +class SolveOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Solve"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "Solve"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Solve"); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + std::vector x_dims_vec = + paddle::framework::vectorize(ctx->GetInputDim("X")); + std::vector y_dims_vec = + paddle::framework::vectorize(ctx->GetInputDim("Y")); + + auto x_dims_n = x_dims_vec.size(); + auto y_dims_n = y_dims_vec.size(); + + PADDLE_ENFORCE_GT(x_dims_n, 1, + platform::errors::InvalidArgument( + "The input tensor X's dimensions of SolveOp " + "should be larger than 1. But received X's " + "dimensions = %d, X's shape = [%s]", + x_dims_n, x_dims)); + + PADDLE_ENFORCE_GE(y_dims_n, 1, + platform::errors::InvalidArgument( + "The input tensor Y's dimensions of SolveOp " + "should be larger than or equal 1. But received Y's " + "dimensions = %d, Y's shape = [%s]", + y_dims_n, y_dims)); + + PADDLE_ENFORCE_EQ(x_dims[x_dims_n - 2], x_dims[x_dims_n - 1], + platform::errors::InvalidArgument( + "The inner-most 2 dimensions of Input(X) all should " + "be square matrices " + "But received X's shape[-2] = %d and shape[-1] = %d.", + x_dims[x_dims_n - 2], x_dims[x_dims_n - 1])); + + bool x_broadcasted = false, y_broadcasted = false; + bool trans_x = false, trans_y = false; + if (x_dims_n == 1) { + x_dims_vec.insert(x_dims_vec.begin(), 1); + x_dims_n = 2; + x_broadcasted = true; + } + + if (y_dims_n == 1) { + y_dims_vec.push_back(1); + y_dims_n = 2; + y_broadcasted = true; + } + + size_t M, N; + if (trans_x) { + M = x_dims_vec[x_dims_n - 1]; + } else { + M = x_dims_vec[x_dims_n - 2]; + } + if (trans_y) { + N = y_dims_vec[y_dims_n - 2]; + } else { + N = y_dims_vec[y_dims_n - 1]; + } + + std::vector new_dims; + if (x_dims_n >= y_dims_n) { + new_dims.assign(x_dims_vec.begin(), x_dims_vec.end() - 2); + } else { + new_dims.assign(y_dims_vec.begin(), y_dims_vec.end() - 2); + } + if (!x_broadcasted) { + new_dims.push_back(M); + } + if (!y_broadcasted) { + new_dims.push_back(N); + } + if (x_broadcasted && y_broadcasted) { + new_dims.push_back(1); + } + + auto out_dims = framework::make_ddim(new_dims); + ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + int customized_type_value = + framework::OpKernelType::kDefaultCustomizedTypeValue; + auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library, customized_type_value); + } +}; + +class SolveOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The first input tensor of solve op."); + AddInput("Y", "(Tensor), The second input tensor of solve op."); + AddOutput("Out", "(Tensor), The output tensor of solve op."); + AddComment(R"DOC( + Solve 
Operator. + This operator is used to compute the solution of a square system of + linear equations with a unique solution for input $X$ and $Y$. + + The equation is: + $$Out = X^{-1} * Y$$ +)DOC"); + } +}; + +class SolveOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map& GetInputOutputWithSameType() + const override { + static std::unordered_map m{{"X", /*->*/ "Out"}}; + return m; + } +}; + +class SolveGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "solve"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "solve"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "solve"); + // reuse the linalg.solve forward output + OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "solve"); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +template +class SolveOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("solve_grad"); + retv->SetInput("X", this->Input("X")); + retv->SetInput("Y", this->Input("Y")); + retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + // reuse the linalg.solve forward output + retv->SetInput("Out", this->Output("Out")); + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + retv->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); + retv->SetAttrMap(this->Attrs()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(solve, ops::SolveOp, ops::SolveOpMaker, + ops::SolveOpInferVarType, + ops::SolveOpGradMaker, + ops::SolveOpGradMaker); + +REGISTER_OPERATOR(solve_grad, ops::SolveGradOp); + +REGISTER_OP_CPU_KERNEL( + solve, ops::SolveKernel, + ops::SolveKernel); +REGISTER_OP_CPU_KERNEL( + solve_grad, ops::SolveGradKernel, + ops::SolveGradKernel); diff --git a/paddle/fluid/operators/solve_op.cu b/paddle/fluid/operators/solve_op.cu new file mode 100644 index 0000000000000..2ca0bcdd7f68b --- /dev/null +++ b/paddle/fluid/operators/solve_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/solve_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL(solve, ops::SolveKernel, + ops::SolveKernel); + +REGISTER_OP_CUDA_KERNEL(solve_grad, + ops::SolveGradKernel, + ops::SolveGradKernel); diff --git a/paddle/fluid/operators/solve_op.h b/paddle/fluid/operators/solve_op.h new file mode 100644 index 0000000000000..d55c2647c1f3a --- /dev/null +++ b/paddle/fluid/operators/solve_op.h @@ -0,0 +1,732 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "Eigen/Core" +#include "Eigen/LU" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/matrix_solve.h" +#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" +#include "paddle/fluid/operators/squeeze_op.h" +#if defined(__NVCC__) || defined(__HIPCC__) +#include "paddle/fluid/operators/reduce_ops/cub_reduce.h" +#endif + +#define MAX_RANK_SUPPORTED 6 + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using framework::To32BitIndex; + +constexpr int kMULMKLDNNINT8 = 1; + +struct IdentityFunctor { + HOSTDEVICE explicit inline IdentityFunctor() {} + + template + HOSTDEVICE inline U operator()(const U& x) const { + return x; + } +}; + +template +void ReduceSumForSolveGrad(const Tensor* input, Tensor* output, + const std::vector& reduce_dims, bool keep_dim, + const paddle::framework::ExecutionContext& ctx) { +#if defined(__NVCC__) || defined(__HIPCC__) + auto stream = ctx.cuda_device_context().stream(); + TensorReduce(*input, output, reduce_dims, + static_cast(0), cub::Sum(), + IdentityFunctor(), stream); +#else + ReduceKernelFunctor( + input, output, reduce_dims, keep_dim, false, ctx) + .template apply(); +#endif +} + +// check the input other is vector_case or not +static inline bool is_vector_rhs(const Tensor& input, const Tensor& other) { + auto x_dim = input.dims(); + auto y_dim = other.dims(); + auto x_dim_size = x_dim.size(); + auto y_dim_size = y_dim.size(); + std::vector x_dims_vec = paddle::framework::vectorize(x_dim); + std::vector y_dims_vec = paddle::framework::vectorize(y_dim); + + std::vector::const_iterator f = x_dims_vec.begin(); + std::vector::const_iterator l = x_dims_vec.end() - 1; + std::vector x_dims_vec_cut(f, l); // input.shape[:-1] + + std::vector expected_batched_rhs_shape(x_dims_vec_cut); + bool vector_case = + y_dim_size == 1 || (x_dim_size - 1 == y_dim_size && + y_dims_vec == (expected_batched_rhs_shape)); + + return vector_case; +} + +// unsqueeze operation helper +static framework::DDim GetOutputShapeUnsqueeze( + const std::vector unsqz_dims, const framework::DDim& in_dims) { + int output_size = 
in_dims.size() + static_cast(unsqz_dims.size()); + int cur_output_size = in_dims.size(); + std::vector output_shape(output_size, 0); + + // Validity Check: rank range. + PADDLE_ENFORCE_LE(output_size, 6, + platform::errors::InvalidArgument( + "The output " + "tensor's rank should be less than 6.")); + + for (int axis : unsqz_dims) { + int cur = axis < 0 ? axis + cur_output_size + 1 : axis; + // Validity Check: the axis bound + PADDLE_ENFORCE_GE(cur, 0, platform::errors::InvalidArgument( + "The insert dimension value should " + "not be less than 0")); + PADDLE_ENFORCE_LE(cur, cur_output_size, + platform::errors::InvalidArgument( + "The insert dimension value should not be larger " + "than the dimension size of input tensor")); + // Move old axis, and insert new axis + for (int i = cur_output_size; i >= cur; --i) { + if (output_shape[i] == 1) { + // Move axis + output_shape[i + 1] = 1; + output_shape[i] = 0; + } + } + output_shape[cur] = 1; + // Add the output size. + cur_output_size++; + } + + // Make output shape + for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) { + if (output_shape[out_idx] == 0) { + output_shape[out_idx] = in_dims[in_idx++]; + } + } + + return framework::make_ddim(output_shape); +} + +// operation like squeeze(-1) +static void to_squeeze(const framework::ExecutionContext& context, + const framework::Tensor& in, framework::Tensor* out) { + auto x_dims = in.dims(); + std::vector sqz_dims = {-1}; + auto out_dims = GetOutputShape(sqz_dims, x_dims, true); + out->mutable_data(context.GetPlace(), in.type()); + framework::TensorCopy( + in, context.GetPlace(), + context.template device_context(), out); + out->Resize(out_dims); +} + +// vector_case, need to operate like unsqueeze(-1) +static void to_unsqueeze(const framework::ExecutionContext& context, + const framework::Tensor& in, framework::Tensor* out) { + auto x_dims = in.dims(); + std::vector unsqz_dims = {-1}; + framework::DDim out_dims = out->dims(); + out_dims = GetOutputShapeUnsqueeze(unsqz_dims, x_dims); + framework::TensorCopy( + in, context.GetPlace(), + context.template device_context(), out); + out->Resize(out_dims); +} + +template +Container infer_size_impl(std::vector a, std::vector b) { + size_t dimsA = a.size(); + size_t dimsB = b.size(); + size_t ndim = dimsA > dimsB ? dimsA : dimsB; + Container expandedSizes(ndim); + + for (ptrdiff_t i = (ptrdiff_t)ndim - 1; i >= 0; --i) { + ptrdiff_t offset = ndim - 1 - i; + ptrdiff_t dimA = dimsA - 1 - offset; + ptrdiff_t dimB = dimsB - 1 - offset; + int64_t sizeA = (dimA >= 0) ? a[dimA] : 1; + int64_t sizeB = (dimB >= 0) ? b[dimB] : 1; + + PADDLE_ENFORCE_EQ( + (sizeA == sizeB || sizeA == 1 || sizeB == 1), true, + platform::errors::PreconditionNotMet( + "The size of tensor a (%d) must match the size of tensor b " + "(%d) at non-singleton dimension %d.", + sizeA, sizeB, i)); + + expandedSizes[i] = sizeA == 1 ? 
sizeB : sizeA; + } + return expandedSizes; +} + +// infer size for broadcast operation +static std::vector infer_size(std::vector a, + std::vector b) { + return infer_size_impl>(a, b); +} + +// necessary check before expand operation +static void expand_check(const Tensor& arg1, + std::vector expand_shape) { + auto rank = arg1.dims().size(); + PADDLE_ENFORCE_GE( + rank, 1, platform::errors::InvalidArgument( + "The rank of the input 'X' for expand must be positive, " + "but the value received is %d.", + rank)); + PADDLE_ENFORCE_LE( + rank, MAX_RANK_SUPPORTED, + platform::errors::InvalidArgument( + "The rank of the input 'X' for expand must be less than " + "or equal to %d, but the value received is %d.", + MAX_RANK_SUPPORTED, rank)); + auto shape_size = static_cast(expand_shape.size()); + PADDLE_ENFORCE_GE( + shape_size, rank, + platform::errors::InvalidArgument( + "The number (%d) of elements of 'shape' for expand must be " + "greater than or equal to the rank (%d) of the input 'X'.", + shape_size, rank)); + PADDLE_ENFORCE_LE( + shape_size, MAX_RANK_SUPPORTED, + platform::errors::InvalidArgument( + "The number (%d) of elements of 'shape' for expand must be " + "less than or equal to %d.", + shape_size, MAX_RANK_SUPPORTED)); +} + +// broadcast the batch dimensions of arg1 and arg2. +static inline std::tuple, std::vector> +_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2) { + std::vector arg1_dims_vec = + paddle::framework::vectorize(arg1.dims()); + std::vector arg2_dims_vec = + paddle::framework::vectorize(arg2.dims()); + + std::vector::const_iterator f1 = arg1_dims_vec.begin(); + std::vector::const_iterator l1 = arg1_dims_vec.end() - 2; + std::vector arg1_dims_vec_cut(f1, l1); + + std::vector::const_iterator f2 = arg2_dims_vec.begin(); + std::vector::const_iterator l2 = arg2_dims_vec.end() - 2; + std::vector arg2_dims_vec_cut(f2, l2); + + std::vector expand_batch_portion = + infer_size(arg1_dims_vec_cut, arg2_dims_vec_cut); + + std::vector arg1_expand_size({expand_batch_portion}); + arg1_expand_size.insert( + arg1_expand_size.end(), + {arg1_dims_vec[static_cast(arg1_dims_vec.size()) - 2], + arg1_dims_vec[static_cast(arg1_dims_vec.size()) - 1]}); + + std::vector arg2_expand_size({expand_batch_portion}); + arg2_expand_size.insert( + arg2_expand_size.end(), + {arg2_dims_vec[static_cast(arg2_dims_vec.size()) - 2], + arg2_dims_vec[static_cast(arg2_dims_vec.size()) - 1]}); + + return std::make_tuple(arg1_expand_size, arg2_expand_size); +} + +template +void tensor_expand(const framework::ExecutionContext& context, + const Tensor& arg1, Tensor* out0, + std::vector expand_size) { + auto in_dims = arg1.dims(); + auto expand_shape = expand_size; + auto vec_in_dims = framework::vectorize(in_dims); + auto diff = expand_shape.size() - vec_in_dims.size(); + vec_in_dims.insert(vec_in_dims.begin(), diff, 1); + std::vector repeat_times(vec_in_dims.size()); + for (size_t i = 0; i < vec_in_dims.size(); ++i) { + PADDLE_ENFORCE_NE( + expand_shape[i], 0, + platform::errors::InvalidArgument("The expanded size cannot be zero.")); + if (i < diff) { + PADDLE_ENFORCE_GT( + expand_shape[i], 0, + platform::errors::InvalidArgument( + "The expanded size (%d) for non-existing dimensions must be " + "positive for expand operation.", + expand_shape[i])); + repeat_times[i] = expand_shape[i]; + } else if (expand_shape[i] > 0) { + if (vec_in_dims[i] != 1) { + PADDLE_ENFORCE_EQ( + vec_in_dims[i], expand_shape[i], + platform::errors::InvalidArgument( + "The value (%d) of the non-singleton dimension does not match" + 
" the corresponding value (%d) in shape for expand operation.", + vec_in_dims[i], expand_shape[i])); + repeat_times[i] = 1; + } else { + repeat_times[i] = expand_shape[i]; + } + } else { + PADDLE_ENFORCE_EQ( + expand_shape[i], -1, + platform::errors::InvalidArgument( + "When the value in shape is negative for expand_v2 op, " + "only -1 is supported, but the value received is %d.", + expand_shape[i])); + repeat_times[i] = 1; + } + } + + Eigen::DSizes bcast_dims; + for (size_t i = 0; i < repeat_times.size(); ++i) { + bcast_dims[i] = repeat_times[i]; + } + + framework::DDim new_in_dims = framework::make_ddim(vec_in_dims); + framework::DDim out_dims(new_in_dims); + for (size_t i = 0; i < repeat_times.size(); ++i) { + out_dims[i] *= repeat_times[i]; + } + + out0->Resize(out_dims); + auto x = EigenTensor::From(arg1, new_in_dims); + out0->mutable_data(context.GetPlace()); + auto y = EigenTensor::From(*out0, out_dims); + auto& place = + *context.template device_context().eigen_device(); + // use 32-bit index to speed up + bool use_32bit_index = y.size() < Eigen::NumTraits::highest(); + if (use_32bit_index) { + EigenBroadcast, T, Rank>::Eval( + place, To32BitIndex(y), To32BitIndex(x), bcast_dims); + } else { + EigenBroadcast, T, Rank>::Eval(place, y, x, + bcast_dims); + } +} + +template +static void linalg_solve(const framework::ExecutionContext& context, + const framework::Tensor* x, const framework::Tensor* y, + framework::Tensor* out) { + out->mutable_data(context.GetPlace()); + + auto& dev_ctx = context.template device_context(); + math::MatrixSolveFunctor mat_solve; + + // input y can be vector or matrix + // but need to be unsqueezed if y is a vector + bool is_vector = false; + is_vector = is_vector_rhs(*x, *y); + + Tensor tmp_y; + if (is_vector) { + tmp_y.mutable_data(context.GetPlace(), y->type()); + to_unsqueeze(context, *y, &tmp_y); + } else { + tmp_y.Resize(y->dims()); + tmp_y.mutable_data(context.GetPlace(), y->type()); + framework::TensorCopy( + *y, context.GetPlace(), + context.template device_context(), &tmp_y); + } + + Tensor tmp_x; + tmp_x.Resize(x->dims()); + tmp_x.mutable_data(context.GetPlace(), x->type()); + framework::TensorCopy( + *x, context.GetPlace(), + context.template device_context(), &tmp_x); + + std::vector x_broadcast_dims; + std::vector y_broadcast_dims; + std::tie(x_broadcast_dims, y_broadcast_dims) = + _broadcast_batch_dims(tmp_x, tmp_y); + + expand_check(tmp_x, x_broadcast_dims); + expand_check(tmp_y, y_broadcast_dims); + + Tensor tmp_x_bc; + Tensor tmp_y_bc; + auto tmp_x_rank = tmp_x.dims().size(); + auto tmp_y_rank = tmp_y.dims().size(); + + auto rank_0 = std::max(tmp_x_rank, static_cast(x_broadcast_dims.size())); + switch (rank_0) { + case 1: + tensor_expand<1, T, DeviceContext>(context, tmp_x, &tmp_x_bc, + x_broadcast_dims); + break; + case 2: + tensor_expand<2, T, DeviceContext>(context, tmp_x, &tmp_x_bc, + x_broadcast_dims); + break; + case 3: + tensor_expand<3, T, DeviceContext>(context, tmp_x, &tmp_x_bc, + x_broadcast_dims); + break; + case 4: + tensor_expand<4, T, DeviceContext>(context, tmp_x, &tmp_x_bc, + x_broadcast_dims); + break; + case 5: + tensor_expand<5, T, DeviceContext>(context, tmp_x, &tmp_x_bc, + x_broadcast_dims); + break; + case 6: + tensor_expand<6, T, DeviceContext>(context, tmp_x, &tmp_x_bc, + x_broadcast_dims); + break; + } + + auto rank_1 = std::max(tmp_y_rank, static_cast(y_broadcast_dims.size())); + switch (rank_1) { + case 1: + tensor_expand<1, T, DeviceContext>(context, tmp_y, &tmp_y_bc, + y_broadcast_dims); + break; + case 2: + 
tensor_expand<2, T, DeviceContext>(context, tmp_y, &tmp_y_bc, + y_broadcast_dims); + break; + case 3: + tensor_expand<3, T, DeviceContext>(context, tmp_y, &tmp_y_bc, + y_broadcast_dims); + break; + case 4: + tensor_expand<4, T, DeviceContext>(context, tmp_y, &tmp_y_bc, + y_broadcast_dims); + break; + case 5: + tensor_expand<5, T, DeviceContext>(context, tmp_y, &tmp_y_bc, + y_broadcast_dims); + break; + case 6: + tensor_expand<6, T, DeviceContext>(context, tmp_y, &tmp_y_bc, + y_broadcast_dims); + break; + } + + auto x_dim = x->dims(); + auto y_dim = y->dims(); + auto x_dim_size = x_dim.size(); + auto y_dim_size = y_dim.size(); + + if (is_vector) { // vector case + out->Resize(tmp_y_bc.dims()); // out.unsqueeze(-1) + mat_solve(dev_ctx, tmp_x_bc, tmp_y_bc, out); + + Tensor out_tmp; + out_tmp.Resize(out->dims()); + out_tmp = *out; + to_squeeze(context, out_tmp, out); // out.squeeze(-1) + } else { + PADDLE_ENFORCE_EQ( + x_dim[x_dim_size - 1], y_dim[y_dim_size - 2], + platform::errors::InvalidArgument( + "Matrix X1 with dimension greater than 2 and any matrix Y1," + "the matrix X1's width must be equal with matrix Y1's " + "height. But received X's shape = [%s], X1's shape = [%s], X1's " + "width = %s; Y's shape = [%s], Y1's shape = [%s], Y1's height = " + "%s.", + x_dim, x_dim, x_dim[x_dim_size - 1], y_dim, y_dim, + y_dim[y_dim_size - 2])); + mat_solve(dev_ctx, tmp_x_bc, tmp_y_bc, out); + } +} + +// for TransposeNormal +static std::vector getNewAxis(const int b_rank) { + std::vector axis_1 = {0}; + std::vector axis_2 = {1, 0}; + std::vector axis_3 = {0, 2, 1}; + std::vector axis_4 = {0, 1, 3, 2}; + std::vector axis_5 = {0, 1, 2, 4, 3}; + std::vector axis_6 = {0, 1, 2, 3, 5, 4}; + std::vector axis_7 = {0, 1, 2, 3, 4, 6, 5}; + std::vector axis_8 = {0, 1, 2, 3, 4, 5, 7, 6}; + std::vector axis_9 = {0, 1, 2, 3, 4, 5, 6, 8, 7}; + switch (b_rank) { + case 1: + return axis_1; + break; + case 2: + return axis_2; + break; + case 3: + return axis_3; + break; + case 4: + return axis_4; + break; + case 5: + return axis_5; + break; + case 6: + return axis_6; + break; + case 7: + return axis_7; + break; + case 8: + return axis_8; + break; + default: + return axis_9; + } +} + +// for Resize +static std::vector getNewDimsVec(const DDim& b_dims) { + std::vector b_dims_vec = paddle::framework::vectorize(b_dims); + int size = b_dims_vec.size(); + if (size >= 2) { + // swap the last 2 elements in b_dims_vec + int64_t temp = b_dims_vec[size - 1]; + b_dims_vec[size - 1] = b_dims_vec[size - 2]; + b_dims_vec[size - 2] = temp; + return b_dims_vec; + } + PADDLE_ENFORCE_NE( + b_dims_vec.empty(), true, + platform::errors::PreconditionNotMet( + "The size of tensor b must not be %d after getting new dims", 0)); + // if b_dims_vec.size() == 1, just retun original vec + return b_dims_vec; +} + +template +class SolveKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const auto* x = context.Input("X"); + const auto* y = context.Input("Y"); + Tensor* out = context.Output("Out"); + linalg_solve(context, x, y, out); + } +}; + +template +class SolveGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + + // reuse the linalg.solve forward output + auto* out = ctx.Input("Out"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = 
ctx.Output(framework::GradVarName("Y")); + + bool is_vector = false; + is_vector = is_vector_rhs(*input, *y); + + Tensor tmp_y; + if (is_vector) { + tmp_y.mutable_data(ctx.GetPlace(), y->type()); + to_unsqueeze(ctx, *y, &tmp_y); + } else { + tmp_y.Resize(y->dims()); + tmp_y.mutable_data(ctx.GetPlace(), y->type()); + framework::TensorCopy( + *y, ctx.GetPlace(), + ctx.template device_context(), &tmp_y); + } + + Tensor tmp_x; + tmp_x.Resize(input->dims()); + tmp_x.mutable_data(ctx.GetPlace(), input->type()); + framework::TensorCopy( + *input, ctx.GetPlace(), + ctx.template device_context(), &tmp_x); + + std::vector x_broadcast_dims; + std::vector y_broadcast_dims; + std::tie(x_broadcast_dims, y_broadcast_dims) = + _broadcast_batch_dims(tmp_x, tmp_y); + + // tmp_dx + Tensor tmp_dx; + tmp_dx.Resize(framework::make_ddim(x_broadcast_dims)); + tmp_dx.mutable_data(ctx.GetPlace()); + + // tmp_dy + Tensor tmp_dy; + tmp_dy.Resize(framework::make_ddim(y_broadcast_dims)); + tmp_dy.mutable_data(ctx.GetPlace()); + + Tensor tmp_input(input->type()); + const auto& new_dims_vec = getNewDimsVec(input->dims()); + tmp_input.Resize(framework::make_ddim(new_dims_vec)); + tmp_input.mutable_data(ctx.GetPlace()); + math::TransposeNormal trans; + std::vector new_axis = getNewAxis(input->dims().size()); + auto& dev_ctx = ctx.template device_context(); + trans(dev_ctx, *input, &tmp_input, new_axis); + + if (dy) { + dy->mutable_data(ctx.GetPlace()); + // reuse linalg_solve forward logics to get tmp_dy + linalg_solve(ctx, &tmp_input, dout, &tmp_dy); + } + + if (dx) { + dx->mutable_data(ctx.GetPlace()); + // to get dx + auto blas = math::GetBlas(ctx); + if (input->dims().size() == 2 && y->dims().size() == 2) { + auto mat_dim_a1 = math::CreateMatrixDescriptor(tmp_dy.dims(), 0, false); + auto mat_dim_b1 = math::CreateMatrixDescriptor(out->dims(), 0, true); + blas.MatMul(tmp_dy, mat_dim_a1, *out, mat_dim_b1, T(-1), &tmp_dx, T(0)); + } else if (is_vector_rhs(*input, *y)) { + Tensor tmp_dy_; + tmp_dy_.mutable_data(ctx.GetPlace(), y->type()); + to_unsqueeze(ctx, tmp_dy, &tmp_dy_); + + Tensor tmp_out_; + tmp_out_.mutable_data(ctx.GetPlace(), out->type()); + to_unsqueeze(ctx, *out, &tmp_out_); + + auto mat_dim_a1 = + math::CreateMatrixDescriptor(tmp_dy_.dims(), 0, false); + auto mat_dim_b1 = + math::CreateMatrixDescriptor(tmp_out_.dims(), 0, true); + blas.MatMul(tmp_dy_, mat_dim_a1, tmp_out_, mat_dim_b1, T(-1), &tmp_dx, + T(0)); + } else { + auto mat_dim_a1 = math::CreateMatrixDescriptor(tmp_dy.dims(), 0, false); + auto mat_dim_b1 = math::CreateMatrixDescriptor(out->dims(), 0, true); + blas.MatMul(tmp_dy, mat_dim_a1, *out, mat_dim_b1, T(-1), &tmp_dx, T(0)); + } + } + + if (y->dims() != tmp_dy.dims()) { + Tensor dy_help; + dy_help.Resize(tmp_dy.dims()); + dy_help.mutable_data(ctx.GetPlace(), tmp_dy.type()); + framework::TensorCopy( + tmp_dy, ctx.GetPlace(), + ctx.template device_context(), &dy_help); + + // get dims + std::vector x_dims = vectorize(input->dims()); + std::vector y_dims = vectorize(y->dims()); + std::vector dout_dims = vectorize(dout->dims()); + + if (is_vector_rhs(*input, *y)) { + dout_dims.push_back(1); + } + + int y_ndim = y_dims.size(); + int ndim = dout_dims.size(); + + const std::vector dy_help_dims = vectorize(dy_help.dims()); + std::vector dy_broadcast_dims(ndim); + + std::fill(dy_broadcast_dims.data(), + dy_broadcast_dims.data() + ndim - y_ndim, 1); + std::copy(y_dims.data(), y_dims.data() + y_ndim, + dy_broadcast_dims.data() + ndim - y_ndim); + + std::vector dy_reduce_dims; + for (int idx = 0; idx <= ndim 
- 3; idx++) { + if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) { + dy_reduce_dims.push_back(idx); + } + } + // reduce sum to get grad by ReduceSum + if (dy) { + if (dy_reduce_dims.empty()) { + *dy = std::move(dy_help); + } else { + bool keep_dim = true; + if (dy_help.dims().size() != dy->dims().size()) { + keep_dim = false; + } + ReduceSumForSolveGrad(&dy_help, dy, dy_reduce_dims, + keep_dim, ctx); + } + dy->Resize(y->dims()); + } + } else { + framework::TensorCopy( + tmp_dy, ctx.GetPlace(), + ctx.template device_context(), dy); + } + + if (input->dims() != tmp_dx.dims()) { + Tensor dx_help; + dx_help.Resize(tmp_dx.dims()); + dx_help.mutable_data(ctx.GetPlace(), tmp_dx.type()); + framework::TensorCopy( + tmp_dx, ctx.GetPlace(), + ctx.template device_context(), &dx_help); + + // get dims + std::vector x_dims = vectorize(input->dims()); + std::vector y_dims = vectorize(y->dims()); + + int x_ndim = x_dims.size(); + int ndim = x_broadcast_dims.size(); + + const std::vector dx_help_dims = vectorize(dx_help.dims()); + std::vector dx_broadcast_dims(ndim); + + std::fill(dx_broadcast_dims.data(), + dx_broadcast_dims.data() + ndim - x_ndim, 1); + std::copy(x_dims.data(), x_dims.data() + x_ndim, + dx_broadcast_dims.data() + ndim - x_ndim); + + std::vector dx_reduce_dims; + for (int idx = 0; idx <= ndim - 3; idx++) { + if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) { + dx_reduce_dims.push_back(idx); + } + } + // reduce sum to get grad by ReduceSum + if (dx) { + dx->mutable_data(ctx.GetPlace()); + if (dx_reduce_dims.empty()) { + *dx = std::move(dx_help); + } else { + bool keep_dim = true; + if (dx_help.dims().size() != dx->dims().size()) { + keep_dim = false; + } + ReduceSumForSolveGrad(&dx_help, dx, dx_reduce_dims, + keep_dim, ctx); + } + dx->Resize(input->dims()); + } + } else { + framework::TensorCopy( + tmp_dx, ctx.GetPlace(), + ctx.template device_context(), dx); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/spectral_op.cc b/paddle/fluid/operators/spectral_op.cc new file mode 100644 index 0000000000000..fb50702233b3b --- /dev/null +++ b/paddle/fluid/operators/spectral_op.cc @@ -0,0 +1,870 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/spectral_op.h" + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/transpose_op.h" +#include "paddle/fluid/platform/complex.h" + +#if defined(PADDLE_WITH_ONEMKL) +#include +#elif defined(PADDLE_WITH_POCKETFFT) +#include "extern_pocketfft/pocketfft_hdronly.h" +#endif + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +// FFTC2C +class FFTC2COpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), the input tensor of fft_c2c op."); + AddOutput("Out", "(Tensor), the output tensor of fft_c2c op."); + AddAttr>("axes", + "std::vector, the fft axes."); + AddAttr("normalization", + "fft_norm_type, the fft normalization type."); + AddAttr("forward", "bool, the fft direction."); + AddComment(R"DOC( + Compute complex to complex FFT. + )DOC"); + } +}; + +class FFTC2COp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fft_c2c"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "fft_c2c"); + const auto axes = ctx->Attrs().Get>("axes"); + const auto x_dim = ctx->GetInputDim("X"); + for (size_t i = 0; i < axes.size(); i++) { + PADDLE_ENFORCE_GT(x_dim[axes[i]], 0, + platform::errors::InvalidArgument( + "Invalid fft n-point (%d).", x_dim[axes[i]])); + } + ctx->ShareDim("X", /*->*/ "Out"); // only for c2c + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + const auto kernel_dtype = framework::ToRealType(in_dtype); + return framework::OpKernelType(kernel_dtype, ctx.GetPlace()); + } +}; + +template +class FFTC2CGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("fft_c2c_grad"); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +class FFTC2CGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + const auto out_grad_name = framework::GradVarName("Out"); + OP_INOUT_CHECK(ctx->HasInput(out_grad_name), "Input", out_grad_name, + "fft_c2c_grad"); + const auto x_grad_name = framework::GradVarName("X"); + OP_INOUT_CHECK(ctx->HasOutput(x_grad_name), "Output", x_grad_name, + "fft_c2c_grad"); + + ctx->SetOutputDim(x_grad_name, ctx->GetInputDim(out_grad_name)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + const auto in_dtype = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + const auto kernel_dtype = framework::ToRealType(in_dtype); + return framework::OpKernelType(kernel_dtype, ctx.GetPlace()); + } +}; + +// FFTR2C +class FFTR2COpMaker : public framework::OpProtoAndCheckerMaker { + public: 
+ void Make() override { + AddInput("X", "(Tensor), the input tensor of fft_r2c op."); + AddOutput("Out", "(Tensor), the output tensor of fft_r2c op."); + AddAttr>("axes", + "std::vector, the fft axes."); + AddAttr("normalization", + "fft_norm_type, the fft normalization type."); + AddAttr("forward", "bool, the fft direction."); + AddAttr("onesided", "bool, perform onesided fft."); + AddComment(R"DOC( + Compute real to complex FFT. + )DOC"); + } +}; + +class FFTR2COp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fft_r2c"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "fft_r2c"); + const auto axes = ctx->Attrs().Get>("axes"); + const auto x_dim = ctx->GetInputDim("X"); + for (size_t i = 0; i < axes.size() - 1L; i++) { + PADDLE_ENFORCE_GT(x_dim[axes[i]], 0, + platform::errors::InvalidArgument( + "Invalid fft n-point (%d).", x_dim[axes[i]])); + } + + const bool onesided = ctx->Attrs().Get("onesided"); + if (!onesided) { + ctx->ShareDim("X", /*->*/ "Out"); + } else { + framework::DDim out_dim(ctx->GetInputDim("X")); + const int64_t last_fft_axis = axes.back(); + const int64_t last_fft_dim_size = out_dim.at(last_fft_axis); + out_dim.at(last_fft_axis) = last_fft_dim_size / 2 + 1; + ctx->SetOutputDim("Out", out_dim); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(in_dtype, ctx.GetPlace()); + } +}; + +template +class FFTR2CGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("fft_r2c_grad"); + grad_op->SetInput("X", this->Input("X")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +class FFTR2CGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + const auto out_grad_name = framework::GradVarName("Out"); + OP_INOUT_CHECK(ctx->HasInput(out_grad_name), "Input", out_grad_name, + "fft_r2c_grad"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fft_r2c_grad"); + + const auto x_grad_name = framework::GradVarName("X"); + OP_INOUT_CHECK(ctx->HasOutput(x_grad_name), "Output", x_grad_name, + "fft_r2c_grad"); + + ctx->ShareDim("X", /*->*/ x_grad_name); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + const auto in_dtype = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + const auto kernel_dtype = framework::ToRealType(in_dtype); + return framework::OpKernelType(kernel_dtype, ctx.GetPlace()); + } +}; + +// FFTC2R +class FFTC2ROpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), the input tensor of fft_c2r op."); + AddOutput("Out", "(Tensor), the output tensor of fft_c2r op."); + AddAttr>("axes", + "std::vector, the fft axes."); + AddAttr("normalization", + "fft_norm_type, the fft normalization type."); + AddAttr("forward", 
"bool, the fft direction."); + AddAttr( + "last_dim_size", "int", + "Length of the transformed " + "axis of the output. For n output points, last_dim_size//2 + 1 input" + " points are necessary. If the input is longer than this," + " it is cropped. If it is shorter than this, it is padded" + " with zeros. If last_dim_size is not given, it is taken to be 2*(m-1)" + " where m is the length of the input along the axis " + "specified by axis.") + .SetDefault(0L); + AddComment(R"DOC( + Compute complex to complex FFT. + )DOC"); + } +}; + +class FFTC2ROp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fft_c2r"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "fft_c2r"); + + const auto axes = ctx->Attrs().Get>("axes"); + const auto x_dim = ctx->GetInputDim("X"); + for (size_t i = 0; i < axes.size() - 1L; i++) { + PADDLE_ENFORCE_GT(x_dim[axes[i]], 0, + platform::errors::InvalidArgument( + "Invalid fft n-point (%d).", x_dim[axes[i]])); + } + + const int64_t last_dim_size = ctx->Attrs().Get("last_dim_size"); + framework::DDim out_dim(ctx->GetInputDim("X")); + const int64_t last_fft_axis = axes.back(); + if (last_dim_size == 0) { + const int64_t last_fft_dim_size = out_dim.at(last_fft_axis); + const int64_t fft_n_point = (last_fft_dim_size - 1) * 2; + PADDLE_ENFORCE_GT(fft_n_point, 0, + platform::errors::InvalidArgument( + "Invalid fft n-point (%d).", fft_n_point)); + out_dim.at(last_fft_axis) = fft_n_point; + } else { + PADDLE_ENFORCE_GT(last_dim_size, 0, + platform::errors::InvalidArgument( + "Invalid fft n-point (%d).", last_dim_size)); + out_dim.at(last_fft_axis) = last_dim_size; + } + ctx->SetOutputDim("Out", out_dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + const auto kernel_dtype = framework::ToRealType(in_dtype); + return framework::OpKernelType(kernel_dtype, ctx.GetPlace()); + } +}; + +template +class FFTC2RGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("fft_c2r_grad"); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +class FFTC2RGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + const auto out_grad_name = framework::GradVarName("Out"); + OP_INOUT_CHECK(ctx->HasInput(out_grad_name), "Input", out_grad_name, + "fft_c2r_grad"); + + const auto x_grad_name = framework::GradVarName("X"); + OP_INOUT_CHECK(ctx->HasOutput(x_grad_name), "Output", x_grad_name, + "fft_c2r_grad"); + + const auto axes = ctx->Attrs().Get>("axes"); + + const auto out_grad_dim = ctx->GetInputDim(out_grad_name); + framework::DDim x_grad_dim(out_grad_dim); + const int64_t last_fft_axis = axes.back(); + const int64_t last_fft_dim_size = x_grad_dim.at(last_fft_axis); + x_grad_dim.at(last_fft_axis) = last_fft_dim_size / 2 + 1; + ctx->SetOutputDim(x_grad_name, x_grad_dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + 
const framework::ExecutionContext& ctx) const override { + const auto in_dtype = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + return framework::OpKernelType(in_dtype, ctx.GetPlace()); + } +}; + +// common functions +FFTNormMode get_norm_from_string(const std::string& norm, bool forward) { + if (norm.empty() || norm == "backward") { + return forward ? FFTNormMode::none : FFTNormMode::by_n; + } + + if (norm == "forward") { + return forward ? FFTNormMode::by_n : FFTNormMode::none; + } + + if (norm == "ortho") { + return FFTNormMode::by_sqrt_n; + } + + PADDLE_THROW(platform::errors::InvalidArgument( + "FFT norm string must be 'forward' or 'backward' or 'ortho', " + "received %s", + norm)); +} + +// FFT Functors +#if defined(PADDLE_WITH_ONEMKL) + +namespace { +static inline void MKL_DFTI_CHECK(MKL_INT status) { + if (status && !DftiErrorClass(status, DFTI_NO_ERROR)) { + PADDLE_THROW(platform::errors::External(DftiErrorMessage(status))); + } +} + +struct DftiDescriptorDeleter { + void operator()(DFTI_DESCRIPTOR_HANDLE handle) { + if (handle != nullptr) { + MKL_DFTI_CHECK(DftiFreeDescriptor(&handle)); + } + } +}; + +class DftiDescriptor { + public: + void init(DFTI_CONFIG_VALUE precision, DFTI_CONFIG_VALUE signal_type, + MKL_LONG signal_ndim, MKL_LONG* sizes) { + if (desc_ != nullptr) { + PADDLE_THROW(platform::errors::AlreadyExists( + "DFT DESCRIPTOR can only be initialized once.")); + } + DFTI_DESCRIPTOR* raw_desc; + if (signal_ndim == 1) { + MKL_DFTI_CHECK( + DftiCreateDescriptor(&raw_desc, precision, signal_type, 1, sizes[0])); + } else { + MKL_DFTI_CHECK(DftiCreateDescriptor(&raw_desc, precision, signal_type, + signal_ndim, sizes)); + } + desc_.reset(raw_desc); + } + + DFTI_DESCRIPTOR* get() const { + if (desc_ == nullptr) { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "DFTI DESCRIPTOR has not been initialized.")); + } + return desc_.get(); + } + + private: + std::unique_ptr desc_; +}; + +DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype, + const framework::proto::VarType::Type& out_dtype, + const framework::DDim& in_strides, + const framework::DDim& out_strides, + const std::vector& signal_sizes, + FFTNormMode normalization, bool forward) { + const DFTI_CONFIG_VALUE precision = [&] { + switch (in_dtype) { + case framework::proto::VarType::FP32: + return DFTI_SINGLE; + case framework::proto::VarType::COMPLEX64: + return DFTI_SINGLE; + case framework::proto::VarType::FP64: + return DFTI_DOUBLE; + case framework::proto::VarType::COMPLEX128: + return DFTI_DOUBLE; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Input data type should be FP32, FP64, COMPLEX64 or COMPLEX128.")); + } + }(); + + // C2C, R2C, C2R + const FFTTransformType fft_type = GetFFTTransformType(in_dtype, out_dtype); + const DFTI_CONFIG_VALUE domain = + (fft_type == FFTTransformType::C2C) ? DFTI_COMPLEX : DFTI_REAL; + + // const bool complex_input = framework::IsComplexType(in_dtype); + // const bool complex_output = framework::IsComplexType(out_dtype); + // const DFTI_CONFIG_VALUE domain = [&] { + // if (forward) { + // return complex_input ? DFTI_COMPLEX : DFTI_REAL; + // } else { + // return complex_output ? 
DFTI_COMPLEX : DFTI_REAL; + // } + // }(); + + DftiDescriptor descriptor; + std::vector fft_sizes(signal_sizes.cbegin(), signal_sizes.cend()); + const MKL_LONG signal_ndim = fft_sizes.size() - 1; + descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1); + + // placement inplace or not inplace + MKL_DFTI_CHECK( + DftiSetValue(descriptor.get(), DFTI_PLACEMENT, DFTI_NOT_INPLACE)); + + // number of transformations + const MKL_LONG batch_size = fft_sizes[0]; + MKL_DFTI_CHECK( + DftiSetValue(descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size)); + + // input & output distance + const MKL_LONG idist = in_strides[0]; + const MKL_LONG odist = out_strides[0]; + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist)); + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_OUTPUT_DISTANCE, odist)); + + // input & output stride + std::vector mkl_in_stride(1 + signal_ndim, 0); + std::vector mkl_out_stride(1 + signal_ndim, 0); + for (MKL_LONG i = 1; i <= signal_ndim; i++) { + mkl_in_stride[i] = in_strides[i]; + mkl_out_stride[i] = out_strides[i]; + } + MKL_DFTI_CHECK( + DftiSetValue(descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data())); + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_OUTPUT_STRIDES, + mkl_out_stride.data())); + + // conjugate even storage + if (!(fft_type == FFTTransformType::C2C)) { + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, + DFTI_COMPLEX_COMPLEX)); + } + + MKL_LONG signal_numel = + std::accumulate(fft_sizes.cbegin() + 1, fft_sizes.cend(), 1UL, + std::multiplies()); + if (normalization != FFTNormMode::none) { + const double scale = + ((normalization == FFTNormMode::by_sqrt_n) + ? 1.0 / std::sqrt(static_cast(signal_numel)) + : 1.0 / static_cast(signal_numel)); + const auto scale_direction = [&]() { + if (fft_type == FFTTransformType::R2C || + (fft_type == FFTTransformType::C2C && forward)) { + return DFTI_FORWARD_SCALE; + } else { + // (fft_type == FFTTransformType::C2R || + // (fft_type == FFTTransformType::C2C && !forward)) + return DFTI_BACKWARD_SCALE; + } + }(); + MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), scale_direction, scale)); + } + + // commit the descriptor + MKL_DFTI_CHECK(DftiCommitDescriptor(descriptor.get())); + return descriptor; +} + +// Execute a general fft operation (can be c2c, onesided r2c or onesided c2r) +template +void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out, + const std::vector& axes, FFTNormMode normalization, + bool forward) { + const framework::DDim& in_sizes = x->dims(); + const int ndim = in_sizes.size(); + const int signal_ndim = axes.size(); + const int batch_ndim = ndim - signal_ndim; + const framework::DDim& out_sizes = out->dims(); + + // make a dim permutation + std::vector dim_permute(ndim); + std::iota(dim_permute.begin(), dim_permute.end(), 0); + std::vector is_transformed_dim(ndim, false); + for (const auto& d : axes) { + is_transformed_dim[d] = true; + } + const auto batch_end = + std::partition(dim_permute.begin(), dim_permute.end(), + [&](size_t axis) { return !is_transformed_dim[axis]; }); + std::copy(axes.cbegin(), axes.cend(), batch_end); + + // transpose input according to that permutation + framework::DDim transposed_input_shape = in_sizes.transpose(dim_permute); + std::vector transposed_input_shape_ = + framework::vectorize(transposed_input_shape); + framework::Tensor transposed_input; + transposed_input.Resize(transposed_input_shape); + const auto place = ctx.GetPlace(); + transposed_input.mutable_data(place); + 
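  // For example, with a rank-4 input and axes = {2, 3}, the two untransformed
  // (batch) axes are partitioned to the front of dim_permute (their relative
  // order is unspecified, since std::partition is not stable) and {2, 3} are
  // copied to the back, so the FFT always runs over the innermost axes of the
  // transposed copy.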
TransCompute(ndim, ctx, *x, &transposed_input, + dim_permute); + + // make an collapsed input: collapse batch axes for input + const int batch_size = std::accumulate( + transposed_input_shape.Get(), transposed_input_shape.Get() + batch_ndim, + 1L, std::multiplies()); + std::vector collapsed_input_shape_(1 + signal_ndim); + collapsed_input_shape_[0] = batch_size; + std::copy(transposed_input_shape_.begin() + batch_ndim, + transposed_input_shape_.end(), collapsed_input_shape_.begin() + 1); + const framework::DDim collapsed_input_shape = + framework::make_ddim(collapsed_input_shape_); + transposed_input.Resize(collapsed_input_shape); + framework::Tensor& collapsed_input = transposed_input; + + // make a collapsed output + std::vector collapsed_output_shape_(1 + signal_ndim); + collapsed_output_shape_[0] = batch_size; + for (int i = 0; i < signal_ndim; i++) { + collapsed_output_shape_[1 + i] = out_sizes[axes[i]]; + } + const framework::DDim collapsed_output_shape = + framework::make_ddim(collapsed_output_shape_); + framework::Tensor collapsed_output; + collapsed_output.Resize(collapsed_output_shape); + collapsed_output.mutable_data(place, out->type()); + + // signal sizes + std::vector signal_sizes(1 + signal_ndim); + signal_sizes[0] = batch_size; + for (int i = 0; i < signal_ndim; i++) { + signal_sizes[1 + i] = + std::max(collapsed_input_shape[1 + i], collapsed_output_shape[1 + i]); + } + + // input & output stride + const framework::DDim input_stride = framework::stride(collapsed_input_shape); + const framework::DDim output_stride = + framework::stride(collapsed_output_shape); + + // make a DFTI_DESCRIPTOR + DftiDescriptor desc = + _plan_mkl_fft(x->type(), out->type(), input_stride, output_stride, + signal_sizes, normalization, forward); + + const FFTTransformType fft_type = GetFFTTransformType(x->type(), out->type()); + if (fft_type == FFTTransformType::C2R && forward) { + framework::Tensor collapsed_input_conj(collapsed_input.type()); + collapsed_input_conj.mutable_data(collapsed_input.dims(), + ctx.GetPlace()); + // conjugate the input + platform::ForRange for_range(ctx, collapsed_input.numel()); + math::ConjFunctor functor(collapsed_input.data(), + collapsed_input.numel(), + collapsed_input_conj.data()); + for_range(functor); + MKL_DFTI_CHECK(DftiComputeBackward(desc.get(), + collapsed_input_conj.data(), + collapsed_output.data())); + } else if (fft_type == FFTTransformType::R2C && !forward) { + framework::Tensor collapsed_output_conj(collapsed_output.type()); + collapsed_output_conj.mutable_data(collapsed_output.dims(), + ctx.GetPlace()); + MKL_DFTI_CHECK(DftiComputeForward(desc.get(), collapsed_input.data(), + collapsed_output_conj.data())); + // conjugate the output + platform::ForRange for_range(ctx, collapsed_output.numel()); + math::ConjFunctor functor(collapsed_output_conj.data(), + collapsed_output.numel(), + collapsed_output.data()); + for_range(functor); + } else { + if (forward) { + MKL_DFTI_CHECK(DftiComputeForward(desc.get(), + collapsed_input.data(), + collapsed_output.data())); + } else { + MKL_DFTI_CHECK(DftiComputeBackward(desc.get(), + collapsed_input.data(), + collapsed_output.data())); + } + } + + // resize for the collapsed output + framework::DDim transposed_output_shape = out_sizes.transpose(dim_permute); + collapsed_output.Resize(transposed_output_shape); + framework::Tensor& transposed_output = collapsed_output; + + // reverse the transposition + std::vector reverse_dim_permute(ndim); + for (int i = 0; i < ndim; i++) { + reverse_dim_permute[dim_permute[i]] = i; + 
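    // Illustrative case: if dim_permute were {0, 2, 3, 1}, this loop yields
    // reverse_dim_permute = {0, 3, 1, 2}; applying it to transposed_output
    // restores the original axis order expected by `out`.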
} + TransCompute(ndim, ctx, transposed_output, + out, reverse_dim_permute); +} +} // anonymous namespace + +template +struct FFTC2CFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + exec_fft(ctx, x, out, axes, + normalization, forward); + } +}; + +template +struct FFTR2CFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + exec_fft(ctx, x, out, axes, + normalization, forward); + } +}; + +template +struct FFTC2RFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + if (axes.size() > 1) { + const std::vector c2c_dims(axes.begin(), axes.end() - 1); + Tensor temp; + temp.mutable_data(x->dims(), ctx.GetPlace()); + + FFTC2CFunctor c2c_functor; + c2c_functor(ctx, x, &temp, c2c_dims, normalization, forward); + + const std::vector new_axes{axes.back()}; + exec_fft(ctx, &temp, out, new_axes, + normalization, forward); + } else { + exec_fft(ctx, x, out, axes, + normalization, forward); + } + } +}; + +#elif defined(PADDLE_WITH_POCKETFFT) + +namespace { +template +T compute_factor(int64_t size, FFTNormMode normalization) { + constexpr auto one = static_cast(1); + switch (normalization) { + case FFTNormMode::none: + return one; + case FFTNormMode::by_n: + return one / static_cast(size); + case FFTNormMode::by_sqrt_n: + return one / std::sqrt(static_cast(size)); + } + PADDLE_THROW( + platform::errors::InvalidArgument("Unsupported normalization type")); +} +} // anonymous namespace + +template +struct FFTC2CFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + using R = typename Ti::value_type; + using C = std::complex; + + const auto& input_dim = x->dims(); + const std::vector in_sizes = + framework::vectorize(input_dim); + std::vector in_strides = + framework::vectorize(framework::stride(input_dim)); + const int64_t data_size = sizeof(C); + std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + + const auto* in_data = reinterpret_cast(x->data()); + auto* out_data = reinterpret_cast(out->data()); + // pocketfft requires std::vector + std::vector axes_(axes.size()); + std::copy(axes.begin(), axes.end(), axes_.begin()); + // compuet factor + int64_t signal_numel = 1; + for (auto i : axes) { + signal_numel *= in_sizes[i]; + } + R factor = compute_factor(signal_numel, normalization); + pocketfft::c2c(in_sizes, in_strides, in_strides, axes_, forward, in_data, + out_data, factor); + } +}; + +template +struct FFTR2CFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + using R = Ti; + using C = std::complex; + + const auto& input_dim = x->dims(); + const std::vector in_sizes = + framework::vectorize(input_dim); + std::vector in_strides = + framework::vectorize(framework::stride(input_dim)); + { + const int64_t data_size = sizeof(R); + std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } + + const auto& output_dim = out->dims(); + const std::vector out_sizes = + 
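    // Note: pocketfft takes shapes in elements but strides in bytes, which is
    // why every stride vector here is scaled by the element size. The
    // normalization factor is folded directly into the transform, e.g. for
    // FFTNormMode::by_sqrt_n and a 1024-point signal the factor is 1/32.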
framework::vectorize(output_dim); + std::vector out_strides = + framework::vectorize(framework::stride(output_dim)); + { + const int64_t data_size = sizeof(C); + std::transform(out_strides.begin(), out_strides.end(), + out_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } + + const auto* in_data = x->data(); + auto* out_data = reinterpret_cast(out->data()); + // pocketfft requires std::vector + std::vector axes_(axes.size()); + std::copy(axes.begin(), axes.end(), axes_.begin()); + // compuet normalization factor + int64_t signal_numel = 1; + for (auto i : axes) { + signal_numel *= in_sizes[i]; + } + R factor = compute_factor(signal_numel, normalization); + pocketfft::r2c(in_sizes, in_strides, out_strides, axes_, forward, in_data, + out_data, factor); + } +}; + +template +struct FFTC2RFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + using R = To; + using C = std::complex; + + const auto& input_dim = x->dims(); + const std::vector in_sizes = + framework::vectorize(input_dim); + std::vector in_strides = + framework::vectorize(framework::stride(input_dim)); + { + const int64_t data_size = sizeof(C); + std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } + + const auto& output_dim = out->dims(); + const std::vector out_sizes = + framework::vectorize(output_dim); + std::vector out_strides = + framework::vectorize(framework::stride(output_dim)); + { + const int64_t data_size = sizeof(R); + std::transform(out_strides.begin(), out_strides.end(), + out_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } + + const auto* in_data = reinterpret_cast(x->data()); + auto* out_data = out->data(); + // pocketfft requires std::vector + std::vector axes_(axes.size()); + std::copy(axes.begin(), axes.end(), axes_.begin()); + // compuet normalization factor + int64_t signal_numel = 1; + for (auto i : axes) { + signal_numel *= out_sizes[i]; + } + R factor = compute_factor(signal_numel, normalization); + pocketfft::c2r(out_sizes, in_strides, out_strides, axes_, forward, in_data, + out_data, factor); + } +}; + +#endif + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(fft_c2c, ops::FFTC2COp, ops::FFTC2COpMaker, + ops::FFTC2CGradOpMaker, + ops::FFTC2CGradOpMaker); +REGISTER_OP_CPU_KERNEL( + fft_c2c, ops::FFTC2CKernel, + ops::FFTC2CKernel); + +REGISTER_OPERATOR(fft_c2c_grad, ops::FFTC2CGradOp); +REGISTER_OP_CPU_KERNEL( + fft_c2c_grad, + ops::FFTC2CGradKernel, + ops::FFTC2CGradKernel); + +REGISTER_OPERATOR(fft_r2c, ops::FFTR2COp, ops::FFTR2COpMaker, + ops::FFTR2CGradOpMaker, + ops::FFTR2CGradOpMaker); +REGISTER_OP_CPU_KERNEL( + fft_r2c, ops::FFTR2CKernel, + ops::FFTR2CKernel); + +REGISTER_OPERATOR(fft_r2c_grad, ops::FFTR2CGradOp); +REGISTER_OP_CPU_KERNEL( + fft_r2c_grad, + ops::FFTR2CGradKernel, + ops::FFTR2CGradKernel); + +REGISTER_OPERATOR(fft_c2r, ops::FFTC2ROp, ops::FFTC2ROpMaker, + ops::FFTC2RGradOpMaker, + ops::FFTC2RGradOpMaker); +REGISTER_OP_CPU_KERNEL( + fft_c2r, ops::FFTC2RKernel, + ops::FFTC2RKernel); + +REGISTER_OPERATOR(fft_c2r_grad, ops::FFTC2RGradOp); +REGISTER_OP_CPU_KERNEL( + fft_c2r_grad, + ops::FFTC2RGradKernel, + ops::FFTC2RGradKernel); diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu new file mode 100644 index 0000000000000..9aa5ca39d737e --- /dev/null +++ 
b/paddle/fluid/operators/spectral_op.cu @@ -0,0 +1,643 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/operators/conj_op.h" +#include "paddle/fluid/operators/spectral_op.h" +#include "paddle/fluid/operators/transpose_op.h" +#include "paddle/fluid/platform/dynload/cufft.h" + +namespace paddle { +namespace operators { + +namespace { + +using ScalarType = framework::proto::VarType::Type; +const int64_t kMaxCUFFTNdim = 3; +const int64_t kMaxDataNdim = kMaxCUFFTNdim + 1; + +static inline std::string get_cufft_error_info(cufftResult error) { + switch (error) { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; +#ifndef __HIPCC__ + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; +#endif + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + default: + std::ostringstream ss; + ss << "unknown error " << error; + return ss.str(); + } +} + +static inline void CUFFT_CHECK(cufftResult error) { + if (error != CUFFT_SUCCESS) { + PADDLE_THROW(platform::errors::External(get_cufft_error_info(error))); + } +} + +// This struct is used to easily compute hashes of the +// parameters. It will be the **key** to the plan cache. +struct PlanKey { + // between 1 and kMaxCUFFTNdim, i.e., 1 <= signal_ndim <= 3 + int64_t signal_ndim_; + // These include additional batch dimension as well. 
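  // For instance, a 2-D transform over an input collapsed to [8, 64, 128]
  // (batch first) stores signal_ndim_ = 2 and sizes_ = {8, 64, 128, 0};
  // the trailing zero comes from the memset in the constructor below and
  // keeps the byte-wise hash of the key deterministic.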
+ int64_t sizes_[kMaxDataNdim]; + int64_t input_shape_[kMaxDataNdim]; + int64_t output_shape_[kMaxDataNdim]; + FFTTransformType fft_type_; + ScalarType value_type_; + + PlanKey() = default; + + PlanKey(const std::vector& in_shape, + const std::vector& out_shape, + const std::vector& signal_size, FFTTransformType fft_type, + ScalarType value_type) { + // Padding bits must be zeroed for hashing + memset(this, 0, sizeof(*this)); + signal_ndim_ = signal_size.size() - 1; + fft_type_ = fft_type; + value_type_ = value_type; + + std::copy(signal_size.cbegin(), signal_size.cend(), sizes_); + std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_); + std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_); + } +}; + +// An RAII encapsulation of cuFFTHandle +class CuFFTHandle { + ::cufftHandle handle_; + + public: + CuFFTHandle() { CUFFT_CHECK(platform::dynload::cufftCreate(&handle_)); } + + ::cufftHandle& get() { return handle_; } + const ::cufftHandle& get() const { return handle_; } + + ~CuFFTHandle() { +// Not using fftDestroy() for rocFFT to work around double freeing of handles +#ifndef __HIPCC__ + CUFFT_CHECK(platform::dynload::cufftDestroy(handle_)); +#endif + } +}; + +#ifdef __HIPCC__ +using plan_size_type = int; +#else +using plan_size_type = long long int; // NOLINT +#endif + +// This class contains all the information needed to execute a cuFFT plan: +// 1. the plan +// 2. the workspace size needed +class CuFFTConfig { + public: + // Only move semantics is enought for this class. Although we already use + // unique_ptr for the plan, still remove copy constructor and assignment op so + // we don't accidentally copy and take perf hit. + CuFFTConfig(const CuFFTConfig&) = delete; + CuFFTConfig& operator=(CuFFTConfig const&) = delete; + + explicit CuFFTConfig(const PlanKey& plan_key) + : CuFFTConfig( + std::vector(plan_key.sizes_, + plan_key.sizes_ + plan_key.signal_ndim_ + 1), + plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {} + + // sizes are full signal, including batch size and always two-sided + CuFFTConfig(const std::vector& sizes, const int64_t signal_ndim, + FFTTransformType fft_type, ScalarType dtype) + : fft_type_(fft_type), value_type_(dtype) { + // signal sizes (excluding batch dim) + std::vector signal_sizes(sizes.begin() + 1, sizes.end()); + + // input batch size + const auto batch = static_cast(sizes[0]); + // const int64_t signal_ndim = sizes.size() - 1; + PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1, + platform::errors::InvalidArgument( + "The signal_ndim must be equal to sizes.size() - 1," + "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", + signal_ndim, sizes.size() - 1)); + +#ifdef __HIPCC__ + hipfftType exec_type = [&] { + if (dtype == framework::proto::VarType::FP32) { + switch (fft_type) { + case FFTTransformType::C2C: + return HIPFFT_C2C; + case FFTTransformType::R2C: + return HIPFFT_R2C; + case FFTTransformType::C2R: + return HIPFFT_C2R; + } + } else if (dtype == framework::proto::VarType::FP64) { + switch (fft_type) { + case FFTTransformType::C2C: + return HIPFFT_Z2Z; + case FFTTransformType::R2C: + return HIPFFT_D2Z; + case FFTTransformType::C2R: + return HIPFFT_Z2D; + } + } + PADDLE_THROW(platform::errors::InvalidArgument( + "hipFFT only support transforms of type float32 and float64")); + }(); +#else + cudaDataType itype, otype, exec_type; + const auto complex_input = has_complex_input(fft_type); + const auto complex_output = has_complex_output(fft_type); + if (dtype == framework::proto::VarType::FP32) { + itype = complex_input 
? CUDA_C_32F : CUDA_R_32F; + otype = complex_output ? CUDA_C_32F : CUDA_R_32F; + exec_type = CUDA_C_32F; + } else if (dtype == framework::proto::VarType::FP64) { + itype = complex_input ? CUDA_C_64F : CUDA_R_64F; + otype = complex_output ? CUDA_C_64F : CUDA_R_64F; + exec_type = CUDA_C_64F; + } else if (dtype == framework::proto::VarType::FP16) { + itype = complex_input ? CUDA_C_16F : CUDA_R_16F; + otype = complex_output ? CUDA_C_16F : CUDA_R_16F; + exec_type = CUDA_C_16F; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "cuFFT only support transforms of type float16, float32 and " + "float64")); + } +#endif + + // disable auto allocation of workspace to use allocator from the framework + CUFFT_CHECK(platform::dynload::cufftSetAutoAllocation( + plan(), /* autoAllocate */ 0)); + + size_t ws_size_t; + +// make plan +#ifdef __HIPCC__ + CUFFT_CHECK(hipfftMakePlanMany( + plan(), signal_ndim, signal_sizes.data(), + /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, + /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type, + batch, &ws_size_t)); +#else + + CUFFT_CHECK(platform::dynload::cufftXtMakePlanMany( + plan(), signal_ndim, signal_sizes.data(), + /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, + /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, + batch, &ws_size_t, exec_type)); +#endif + + ws_size = ws_size_t; + } + + const cufftHandle& plan() const { return plan_ptr.get(); } + + FFTTransformType transform_type() const { return fft_type_; } + ScalarType data_type() const { return value_type_; } + size_t workspace_size() const { return ws_size; } + + private: + CuFFTHandle plan_ptr; + size_t ws_size; + FFTTransformType fft_type_; + ScalarType value_type_; +}; + +// Execute a pre-planned transform +static void exec_cufft_plan(const CuFFTConfig& config, void* in_data, + void* out_data, bool forward) { + auto& plan = config.plan(); +#ifdef __HIPCC__ + auto value_type = config.data_type(); + if (value_type == framework::proto::VarType::FP32) { + switch (config.transform_type()) { + case FFTTransformType::C2C: { + CUFFT_CHECK(hipfftExecC2C(plan, static_cast(in_data), + static_cast(out_data), + forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); + return; + } + case FFTTransformType::R2C: { + CUFFT_CHECK(hipfftExecR2C(plan, static_cast(in_data), + static_cast(out_data))); + return; + } + case FFTTransformType::C2R: { + CUFFT_CHECK(hipfftExecC2R(plan, static_cast(in_data), + static_cast(out_data))); + return; + } + } + } else if (value_type == framework::proto::VarType::FP64) { + switch (config.transform_type()) { + case FFTTransformType::C2C: { + CUFFT_CHECK(hipfftExecZ2Z(plan, + static_cast(in_data), + static_cast(out_data), + forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); + return; + } + case FFTTransformType::R2C: { + CUFFT_CHECK(hipfftExecD2Z(plan, static_cast(in_data), + static_cast(out_data))); + return; + } + case FFTTransformType::C2R: { + CUFFT_CHECK(hipfftExecZ2D(plan, + static_cast(in_data), + static_cast(out_data))); + return; + } + } + } + PADDLE_THROW(platform::errors::InvalidArgument( + "hipFFT only support transforms of type float32 and float64")); +#else + CUFFT_CHECK(platform::dynload::cufftXtExec( + plan, in_data, out_data, forward ? 
CUFFT_FORWARD : CUFFT_INVERSE)); +#endif +} + +// Execute a general unnormalized fft operation (can be c2c, onesided r2c or +// onesided c2r) +template +void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out, + const std::vector& dim, bool forward) { + const auto x_dims = framework::vectorize(X->dims()); + const auto out_dims = framework::vectorize(out->dims()); + const int64_t ndim = static_cast(X->dims().size()); + const int64_t signal_ndim = static_cast(dim.size()); + const int64_t batch_dims = ndim - signal_ndim; + auto tensor_place = ctx.GetPlace(); + + // Transpose batch dimensions first, then with transforming dims + std::vector dim_permute(ndim); + std::vector reverse_dim_permute(ndim); + std::vector trans_dims(ndim); + std::iota(dim_permute.begin(), dim_permute.end(), int{0}); + std::vector is_transformed_dim(ndim); + for (const auto& d : dim) { + is_transformed_dim[d] = true; + } + auto batch_end = + std::partition(dim_permute.begin(), dim_permute.end(), + [&](int64_t d) { return !is_transformed_dim[d]; }); + std::sort(dim_permute.begin(), batch_end); + std::copy(dim.cbegin(), dim.cend(), batch_end); + + for (size_t i = 0; i < ndim; i++) { + trans_dims[i] = x_dims[dim_permute[i]]; // shape of input transpose + reverse_dim_permute[dim_permute[i]] = + static_cast(i); // reverse of dim permute + } + framework::Tensor input; + input.Resize(framework::make_ddim(trans_dims)); + input.mutable_data(tensor_place); + /* + auto in_ret = TransposeSimple::run(ctx, *X, dim_permute, input); + if (!in_ret) { + TransCompute(ndim, ctx, *X, input, dim_permute); + } + */ + TransCompute(ndim, ctx, *X, &input, dim_permute); + + // Reshape batch dimensions into a single dimension + std::vector batched_sizes(signal_ndim + 1); + auto batch_size = + std::accumulate(trans_dims.begin(), trans_dims.begin() + batch_dims, + static_cast(1), std::multiplies()); + batched_sizes[0] = batch_size; + std::copy(trans_dims.begin() + batch_dims, trans_dims.end(), + batched_sizes.begin() + 1); + input.Resize(framework::make_ddim(batched_sizes)); + + // Check the shape of transforming dims with input and output + std::vector signal_size(signal_ndim + 1); + signal_size[0] = batch_size; + for (int64_t i = 0; i < signal_ndim; ++i) { + auto in_size = input.dims()[i + 1]; + auto out_size = out_dims[dim[i]]; + signal_size[i + 1] = std::max(in_size, out_size); + PADDLE_ENFORCE_EQ( + (in_size == signal_size[i + 1] || + in_size == (signal_size[i + 1] / 2) + 1), + true, + platform::errors::InvalidArgument( + "The dimension[%d] of Input size: [%d] must be equal or half to " + "The dimension[%d] of Output size: [%d]", + dim[i], in_size, dim[i], out_size)); + PADDLE_ENFORCE_EQ( + (out_size == signal_size[i + 1] || + out_size == (signal_size[i + 1] / 2) + 1), + true, + platform::errors::InvalidArgument( + "The dimension[%d] of Output size: [%d] must be equal or half to " + "The dimension[%d] of Input size: [%d]", + dim[i], out_size, dim[i], in_size)); + } + + std::vector reshape_out_sizes(ndim); + for (size_t i = 0; i < ndim; ++i) { + reshape_out_sizes[i] = out_dims[dim_permute[i]]; + } + std::vector batched_out_sizes(batched_sizes.begin(), + batched_sizes.end()); + for (size_t i = 0; i < dim.size(); ++i) { + batched_out_sizes[i + 1] = out_dims[dim[i]]; + } + + // output + framework::Tensor output; + output.Resize(framework::make_ddim(batched_out_sizes)); + output.mutable_data(tensor_place); + + // Create the transform plan (either from cache or locally) + const auto value_type = framework::IsComplexType(input.type()) + ? 
framework::ToRealType(input.type()) + : input.type(); + auto fft_type = GetFFTTransformType(input.type(), output.type()); + PlanKey Key(framework::vectorize(input.dims()), + framework::vectorize(output.dims()), signal_size, fft_type, + value_type); + CuFFTConfig uncached_plan(Key); + CuFFTConfig* config = &uncached_plan; + auto& plan = config->plan(); + + // prepare cufft for execution + CUFFT_CHECK(platform::dynload::cufftSetStream(plan, ctx.stream())); + framework::Tensor workspace_tensor; + workspace_tensor.mutable_data(tensor_place, config->workspace_size()); + CUFFT_CHECK( + platform::dynload::cufftSetWorkArea(plan, workspace_tensor.data())); + + // execute transform plan + if (fft_type == FFTTransformType::C2R && forward) { + forward = false; + framework::Tensor input_conj(input.type()); + input_conj.mutable_data(input.dims(), ctx.GetPlace()); + platform::ForRange for_range(ctx, input.numel()); + math::ConjFunctor functor(input.data(), input.numel(), + input_conj.data()); + for_range(functor); + exec_cufft_plan(*config, input_conj.data(), output.data(), + forward); + } else if (fft_type == FFTTransformType::R2C && !forward) { + forward = true; + framework::Tensor out_conj(output.type()); + out_conj.mutable_data(output.dims(), ctx.GetPlace()); + exec_cufft_plan(*config, input.data(), out_conj.data(), + forward); + + platform::ForRange for_range(ctx, output.numel()); + math::ConjFunctor functor(out_conj.data(), output.numel(), + output.data()); + for_range(functor); + } else { + exec_cufft_plan(*config, input.data(), output.data(), forward); + } + + // Inverting output by reshape and transpose to original batch and dimension + output.Resize(framework::make_ddim(reshape_out_sizes)); + out->Resize(framework::make_ddim(out_dims)); + TransCompute(ndim, ctx, output, out, reverse_dim_permute); +} + +// Calculates the normalization constant +double fft_normalization_scale(FFTNormMode normalization, + const std::vector& sizes, + const std::vector& dims) { + // auto norm = static_cast(normalization); + if (normalization == FFTNormMode::none) { + return static_cast(1.0); + } + + int64_t signal_numel = 1; + for (auto dim : dims) { + signal_numel *= sizes[dim]; + } + const double scale_denom = (normalization == FFTNormMode::by_sqrt_n) + ? std::sqrt(signal_numel) + : static_cast(signal_numel); + return static_cast(1.0 / scale_denom); +} + +template +void exec_normalization(const DeviceContext& ctx, const Tensor* in, Tensor* out, + FFTNormMode normalization, + const std::vector& sizes, + const std::vector& axes) { + double scale = fft_normalization_scale(normalization, sizes, axes); + if (scale != 1.0) { + auto eigen_out = framework::EigenVector::Flatten(*out); + auto eigen_in = framework::EigenVector::Flatten(*in); + auto dev = ctx.eigen_device(); + EigenScale::Eval(*dev, eigen_out, eigen_in, + static_cast(scale), + static_cast(0), false); + } else { + framework::TensorCopy(*in, ctx.GetPlace(), out); + } +} +} // anonymous namespace + +// Use the optimized path to perform single R2C or C2R if transformation dim is +// supported by cuFFT +bool use_optimized_cufft_path(const std::vector& axes) { + // For performance reason, when axes starts with (0, 1), do not use the + // optimized path. 
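  // For example, axes = {2} or {1, 2} qualify for the optimized path, while
  // axes = {0, 1} or any request with more than kMaxCUFFTNdim axes do not,
  // and the caller instead splits the work into a C2C transform over the
  // leading axes plus a final 1-D transform over the last axis.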
+ if (axes.size() > kMaxCUFFTNdim || + (axes.size() >= 2 && axes[0] == 0 && axes[1] == 1)) { + return false; + } else { + return true; + } +} + +template +struct FFTC2CFunctor { + void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + if (axes.empty()) { + framework::TensorCopy(*X, ctx.GetPlace(), out); + return; + } + + framework::Tensor* p_out = out; + std::vector out_dims = framework::vectorize(X->dims()); + std::vector working_axes(axes.begin(), axes.end()); + std::vector first_dims; + size_t max_dims; + framework::Tensor working_tensor; + working_tensor.mutable_data(X->dims(), ctx.GetPlace()); + framework::Tensor* p_working_tensor = &working_tensor; + framework::TensorCopy(*X, ctx.GetPlace(), &working_tensor); + + while (true) { + max_dims = + std::min(static_cast(kMaxCUFFTNdim), working_axes.size()); + first_dims.assign(working_axes.end() - max_dims, working_axes.end()); + + exec_fft(ctx, p_working_tensor, + p_out, first_dims, forward); + working_axes.resize(working_axes.size() - max_dims); + first_dims.clear(); + + if (working_axes.empty()) { + break; + } + + std::swap(p_out, p_working_tensor); + } + exec_normalization( + ctx, p_out, out, normalization, out_dims, axes); + } +}; + +template +struct FFTC2RFunctor { + void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + std::vector in_dims = framework::vectorize(X->dims()); + std::vector out_dims = framework::vectorize(out->dims()); + + if (use_optimized_cufft_path(axes)) { + framework::Tensor x_copy(X->type()); + x_copy.mutable_data(X->dims(), ctx.GetPlace()); + framework::TensorCopy(*X, ctx.GetPlace(), &x_copy); + exec_fft(ctx, &x_copy, out, axes, + forward); + } else { + framework::Tensor temp_tensor; + temp_tensor.mutable_data(X->dims(), ctx.GetPlace()); + const std::vector dims(axes.begin(), axes.end() - 1); + + FFTC2CFunctor c2c_functor; + c2c_functor(ctx, X, &temp_tensor, dims, FFTNormMode::none, forward); + + exec_fft(ctx, &temp_tensor, out, + {axes.back()}, forward); + } + exec_normalization( + ctx, out, out, normalization, out_dims, axes); + } +}; + +// n dimension real to complex FFT use cufft lib +template +struct FFTR2CFunctor { + void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + // Step1: R2C transform on the last dimension + framework::Tensor* r2c_out = out; + const std::vector last_dim{axes.back()}; + std::vector out_dims = framework::vectorize(out->dims()); + exec_fft(ctx, X, r2c_out, last_dim, + forward); + + // Step2: C2C transform on the remaining dimension + framework::Tensor c2c_out; + if (axes.size() > 1) { + c2c_out.mutable_data(out->dims(), ctx.GetPlace()); + std::vector remain_dim(axes.begin(), axes.end() - 1); + FFTC2CFunctor fft_c2c_func; + fft_c2c_func(ctx, r2c_out, &c2c_out, remain_dim, FFTNormMode::none, + forward); + } + + const auto in_sizes = framework::vectorize(X->dims()); + framework::Tensor* norm_tensor = axes.size() > 1 ? 
&c2c_out : r2c_out; + exec_normalization( + ctx, norm_tensor, out, normalization, in_sizes, axes); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + fft_c2c, ops::FFTC2CKernel, + ops::FFTC2CKernel); + +REGISTER_OP_CUDA_KERNEL( + fft_c2c_grad, + ops::FFTC2CGradKernel, + ops::FFTC2CGradKernel); + +REGISTER_OP_CUDA_KERNEL( + fft_c2r, ops::FFTC2RKernel, + ops::FFTC2RKernel); + +REGISTER_OP_CUDA_KERNEL( + fft_c2r_grad, + ops::FFTC2RGradKernel, + ops::FFTC2RGradKernel); + +REGISTER_OP_CUDA_KERNEL( + fft_r2c, ops::FFTR2CKernel, + ops::FFTR2CKernel); + +REGISTER_OP_CUDA_KERNEL( + fft_r2c_grad, + ops::FFTR2CGradKernel, + ops::FFTR2CGradKernel); diff --git a/paddle/fluid/operators/spectral_op.h b/paddle/fluid/operators/spectral_op.h new file mode 100644 index 0000000000000..e549c4a454b19 --- /dev/null +++ b/paddle/fluid/operators/spectral_op.h @@ -0,0 +1,461 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#define NOMINMAX // to use std::min std::max correctly on windows +#include +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/conj_op.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/operators/math/padding.h" +#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/platform/for_range.h" +#if defined(__NVCC__) || defined(__HIPCC__) +#include "thrust/device_vector.h" +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +enum class FFTNormMode : int64_t { + none, // No normalization + by_sqrt_n, // Divide by sqrt(signal_size) + by_n, // Divide by signal_size +}; + +FFTNormMode get_norm_from_string(const std::string& norm, bool forward); + +// Enum representing the FFT type +enum class FFTTransformType : int64_t { + C2C = 0, // Complex-to-complex + R2C, // Real-to-complex + C2R, // Complex-to-real +}; + +// Create transform type enum from bools representing if input and output are +// complex +inline FFTTransformType GetFFTTransformType( + framework::proto::VarType::Type input_dtype, + framework::proto::VarType::Type output_dtype) { + auto complex_input = framework::IsComplexType(input_dtype); + auto complex_output = framework::IsComplexType(output_dtype); + if (complex_input && complex_output) { + return FFTTransformType::C2C; + } else if (complex_input && !complex_output) { + return FFTTransformType::C2R; + } else if (!complex_input && complex_output) { + return FFTTransformType::R2C; + } + PADDLE_THROW( + platform::errors::InvalidArgument("Real to real FFTs are not supported")); +} + +// Returns true if the transform type has complex input +inline bool has_complex_input(FFTTransformType type) { + switch (type) { + case 
FFTTransformType::C2C: + case FFTTransformType::C2R: + return true; + + case FFTTransformType::R2C: + return false; + } + PADDLE_THROW(platform::errors::InvalidArgument("Unknown FFTTransformType")); +} + +// Returns true if the transform type has complex output +inline bool has_complex_output(FFTTransformType type) { + switch (type) { + case FFTTransformType::C2C: + case FFTTransformType::R2C: + return true; + + case FFTTransformType::C2R: + return false; + } + PADDLE_THROW(platform::errors::InvalidArgument("Unknown FFTTransformType")); +} + +template +struct FFTFillConjGradFunctor { + T* input_; + const size_t axis_; + const int64_t* strides_; + const size_t double_length_; + + FFTFillConjGradFunctor(T* input, size_t axis, const int64_t* strides, + size_t double_length) + : input_(input), + axis_(axis), + strides_(strides), + double_length_(double_length) {} + + HOSTDEVICE void operator()(size_t index) { + size_t offtset = index; // back + size_t index_i; + for (size_t i = 0; i <= axis_; i++) { + index_i = offtset / strides_[i]; + offtset %= strides_[i]; + } + + if ((0 < index_i) && (index_i < double_length_ + 1)) { + input_[index] *= static_cast(2); + } + } +}; + +template +struct FFTC2CFunctor { + void operator()(const DeviceContext& ctx, const Tensor* X, Tensor* out, + const std::vector& axes, FFTNormMode normalization, + bool forward); +}; + +template +struct FFTR2CFunctor { + void operator()(const DeviceContext& ctx, const Tensor* X, Tensor* out, + const std::vector& axes, FFTNormMode normalization, + bool forward); +}; + +template +struct FFTC2RFunctor { + void operator()(const DeviceContext& ctx, const Tensor* X, Tensor* out, + const std::vector& axes, FFTNormMode normalization, + bool forward); +}; + +// Giving a linear destination index and strides of tensor, get_idx return the +// corresponding linear position of source tensor. +// The linear index is the position of flatten tensor. +// Giving a linear destination index and strides of tensor, get_idx return the +// corresponding linear position of source tensor. +// The linear index is the position of flatten tensor. +HOSTDEVICE inline int64_t get_src_idx(const int64_t dst_idx, + const int64_t* dst_strides, + const int64_t* dst_shape, + const int64_t* src_strides, + const bool* is_fft_axis, const bool conj, + const int64_t rank) { + int64_t src_idx = 0; + int64_t quotient = dst_idx; + int64_t remainder = 0; + + for (int64_t i = 0; i < rank; i++) { + remainder = quotient % dst_strides[i]; + quotient = quotient / dst_strides[i]; + if (conj && is_fft_axis[i]) { + src_idx += ((dst_shape[i] - quotient) % dst_shape[i]) * src_strides[i]; + } else { + src_idx += src_strides[i] * quotient; + } + quotient = remainder; + } + + return src_idx; +} + +HOSTDEVICE inline bool is_conj_part(const int64_t dst_idx, + const int64_t* dst_strides, + const int64_t last_axis, + const int64_t last_axis_size) { + int64_t quotient = dst_idx; + int64_t remainder = 0; + + for (int64_t i = 0; i < last_axis + 1; i++) { + remainder = quotient % dst_strides[i]; + quotient = quotient / dst_strides[i]; + + if ((i == last_axis) && (quotient > last_axis_size - 1)) { + return true; + } + + quotient = remainder; + } + + return false; +} + +// FFTFillConjFunctor fill the destination tensor with source tensor and +// conjugate symmetry element of source tensor . 
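// For a transform of length n along the last FFT axis, the one-sided result
// keeps only the first n / 2 + 1 complex bins; the remaining bins satisfy
// X[k] = conj(X[(n - k) % n]), which is the relation this functor applies
// element-wise when filling the destination tensor.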
+// Use framework::ForRange to iterate destination element with +// supporting different device +template +struct FFTFillConjFunctor { + FFTFillConjFunctor(const C* src_data, C* dst_data, const int64_t* src_strides, + const int64_t* dst_strides, const int64_t* dst_shape, + const bool* is_fft_axis, const int64_t last_axis, + const int64_t last_axis_size, const int64_t rank) + : src_data_(src_data), + dst_data_(dst_data), + src_strides_(src_strides), + dst_strides_(dst_strides), + dst_shape_(dst_shape), + is_fft_axis_(is_fft_axis), + last_axis_(last_axis), + last_axis_size_(last_axis_size), + rank_(rank) {} + HOSTDEVICE void operator()(int64_t dst_idx) { + if (is_conj_part(dst_idx, dst_strides_, last_axis_, last_axis_size_)) { + const auto conj_idx = + get_src_idx(dst_idx, dst_strides_, dst_shape_, src_strides_, + is_fft_axis_, true, rank_); + auto src_value = src_data_[conj_idx]; + auto conj_value = C(src_value.real, -src_value.imag); + dst_data_[dst_idx] = conj_value; + } else { + const auto copy_idx = + get_src_idx(dst_idx, dst_strides_, dst_shape_, src_strides_, + is_fft_axis_, false, rank_); + dst_data_[dst_idx] = src_data_[copy_idx]; + } + } + + const C* src_data_; + C* dst_data_; + const int64_t* src_strides_; + const int64_t* dst_strides_; + const int64_t* dst_shape_; + const bool* is_fft_axis_; + const int64_t last_axis_; + const int64_t last_axis_size_; + const int64_t rank_; +}; + +template +void fill_conj(const DeviceContext& ctx, const Tensor* src, Tensor* dst, + const std::vector& axes) { + std::vector src_strides_v = + framework::vectorize(framework::stride(src->dims())); + std::vector dst_strides_v = + framework::vectorize(framework::stride(dst->dims())); + std::vector dst_shape_v = framework::vectorize(dst->dims()); + const auto src_data = src->data(); + auto dst_data = dst->data(); + const auto last_axis = axes.back(); + const auto last_axis_size = dst->dims().at(last_axis) / 2 + 1; + const int64_t rank = dst->dims().size(); + auto _is_fft_axis = std::make_unique(rank); + for (const auto i : axes) { + _is_fft_axis[i] = true; + } + +#if defined(__NVCC__) || defined(__HIPCC__) + const thrust::device_vector src_strides_g(src_strides_v); + const auto src_strides = thrust::raw_pointer_cast(src_strides_g.data()); + const thrust::device_vector dst_strides_g(dst_strides_v); + const auto dst_strides = thrust::raw_pointer_cast(dst_strides_g.data()); + const thrust::device_vector dst_shape_g(dst_shape_v); + const auto dst_shape = thrust::raw_pointer_cast(dst_shape_g.data()); + const thrust::device_vector is_fft_axis_g(_is_fft_axis.get(), + _is_fft_axis.get() + rank); + const auto p_is_fft_axis = thrust::raw_pointer_cast(is_fft_axis_g.data()); +#else + const auto src_strides = src_strides_v.data(); + const auto dst_strides = dst_strides_v.data(); + const auto dst_shape = dst_shape_v.data(); + const auto p_is_fft_axis = _is_fft_axis.get(); +#endif + platform::ForRange for_range(ctx, dst->numel()); + FFTFillConjFunctor fill_conj_functor(src_data, dst_data, src_strides, + dst_strides, dst_shape, p_is_fft_axis, + last_axis, last_axis_size, rank); + for_range(fill_conj_functor); +} + +template +class FFTC2CKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using C = paddle::platform::complex; + auto& dev_ctx = ctx.device_context(); + + auto axes = ctx.Attr>("axes"); + const std::string& norm_str = ctx.Attr("normalization"); + const bool forward = ctx.Attr("forward"); + const auto* x = ctx.Input("X"); + auto* y = 
ctx.Output("Out"); + + y->mutable_data(ctx.GetPlace()); + auto normalization = get_norm_from_string(norm_str, forward); + + FFTC2CFunctor fft_c2c_func; + fft_c2c_func(dev_ctx, x, y, axes, normalization, forward); + } +}; + +template +class FFTC2CGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using C = paddle::platform::complex; + auto& dev_ctx = ctx.device_context(); + + auto axes = ctx.Attr>("axes"); + const std::string& norm_str = ctx.Attr("normalization"); + const bool forward = ctx.Attr("forward"); + const auto* dy = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + + dx->mutable_data(ctx.GetPlace()); + auto normalization = get_norm_from_string(norm_str, forward); + + FFTC2CFunctor fft_c2c_func; + fft_c2c_func(dev_ctx, dy, dx, axes, normalization, !forward); + } +}; + +template +class FFTR2CKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using C = paddle::platform::complex; + auto& dev_ctx = ctx.device_context(); + + auto axes = ctx.Attr>("axes"); + const std::string& norm_str = ctx.Attr("normalization"); + const bool forward = ctx.Attr("forward"); + const bool onesided = ctx.Attr("onesided"); + const auto* x = ctx.Input("X"); + auto* y = ctx.Output("Out"); + + y->mutable_data(ctx.GetPlace()); + auto normalization = get_norm_from_string(norm_str, forward); + + FFTR2CFunctor fft_r2c_func; + + if (onesided) { + fft_r2c_func(dev_ctx, x, y, axes, normalization, forward); + } else { + framework::DDim onesided_dims(y->dims()); + const int64_t onesided_last_axis_size = y->dims().at(axes.back()) / 2 + 1; + onesided_dims.at(axes.back()) = onesided_last_axis_size; + framework::Tensor onesided_out; + onesided_out.mutable_data(onesided_dims, ctx.GetPlace()); + fft_r2c_func(dev_ctx, x, &onesided_out, axes, normalization, forward); + fill_conj(dev_ctx, &onesided_out, y, axes); + } + } +}; + +template +class FFTR2CGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using C = paddle::platform::complex; + auto& dev_ctx = ctx.device_context(); + + const auto axes = ctx.Attr>("axes"); + const std::string& norm_str = ctx.Attr("normalization"); + const bool forward = ctx.Attr("forward"); + const bool onesided = ctx.Attr("onesided"); + + const auto* dy = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + framework::Tensor complex_dx; + complex_dx.mutable_data(dx->dims(), ctx.GetPlace()); + + auto normalization = get_norm_from_string(norm_str, forward); + FFTC2CFunctor fft_c2c_func; + + if (!onesided) { + fft_c2c_func(dev_ctx, dy, &complex_dx, axes, normalization, !forward); + } else { + framework::Tensor full_dy; + full_dy.mutable_data(dx->dims(), ctx.GetPlace()); + auto zero_length = static_cast(full_dy.dims().at(axes.back()) - + dy->dims().at(axes.back())); + auto rank = dy->dims().size(); + + std::vector pads(rank * 2, 0); + pads[axes.back() * 2 + 1] = zero_length; + + paddle::operators::math::PaddingFunctor( + rank, ctx, pads, static_cast(0), *dy, &full_dy); + fft_c2c_func(dev_ctx, &full_dy, &complex_dx, axes, normalization, + !forward); + } + framework::TransComplexToReal(dx->type(), complex_dx.type(), complex_dx, + dx); + } +}; + +template +class FFTC2RKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& 
ctx) const override { + using C = paddle::platform::complex; + auto& dev_ctx = ctx.device_context(); + + auto axes = ctx.Attr>("axes"); + const std::string& norm_str = ctx.Attr("normalization"); + const bool forward = ctx.Attr("forward"); + const auto* x = ctx.Input("X"); + auto* y = ctx.Output("Out"); + + y->mutable_data(ctx.GetPlace()); + auto normalization = get_norm_from_string(norm_str, forward); + + FFTC2RFunctor fft_c2r_func; + fft_c2r_func(dev_ctx, x, y, axes, normalization, forward); + } +}; + +template +class FFTC2RGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using C = paddle::platform::complex; + auto& dev_ctx = ctx.device_context(); + + auto axes = ctx.Attr>("axes"); + const std::string& norm_str = ctx.Attr("normalization"); + const bool forward = ctx.Attr("forward"); + const auto* dy = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + + C* pdx = dx->mutable_data(ctx.GetPlace()); + auto normalization = get_norm_from_string(norm_str, forward); + + FFTR2CFunctor fft_r2c_func; + fft_r2c_func(dev_ctx, dy, dx, axes, normalization, !forward); + + const int64_t double_length = + dy->dims()[axes.back()] - dx->dims()[axes.back()]; + const framework::DDim strides = framework::stride(dx->dims()); + +#if defined(__NVCC__) || defined(__HIPCC__) + const thrust::device_vector strides_g( + framework::vectorize(strides)); + const int64_t* pstrides = thrust::raw_pointer_cast(strides_g.data()); +#else + const int64_t* pstrides = strides.Get(); +#endif + + FFTFillConjGradFunctor func(pdx, axes.back(), pstrides, double_length); + size_t limit = dx->numel(); + platform::ForRange for_range(dev_ctx, limit); + for_range(func); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index cf24faf25db70..8894ca650de03 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -114,11 +114,11 @@ class SqueezeOp : public framework::OperatorWithKernel { framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); #ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } +// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { +// return framework::OpKernelType(input_data_type, ctx.GetPlace(), +// framework::DataLayout::kMKLDNN, +// framework::LibraryType::kMKLDNN); +// } #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -141,11 +141,11 @@ class SqueezeGradOp : public framework::OperatorWithKernel { ctx, framework::GradVarName("Out")); #ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } +// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { +// return framework::OpKernelType(input_data_type, ctx.GetPlace(), +// framework::DataLayout::kMKLDNN, +// framework::LibraryType::kMKLDNN); +// } #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -242,11 +242,11 @@ class Squeeze2Op : public framework::OperatorWithKernel { framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); #ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return 
framework::OpKernelType(input_data_type, ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } +// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { +// return framework::OpKernelType(input_data_type, ctx.GetPlace(), +// framework::DataLayout::kMKLDNN, +// framework::LibraryType::kMKLDNN); +// } #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -288,11 +288,11 @@ class Squeeze2GradOp : public framework::OperatorWithKernel { ctx, framework::GradVarName("Out")); #ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } +// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { +// return framework::OpKernelType(input_data_type, ctx.GetPlace(), +// framework::DataLayout::kMKLDNN, +// framework::LibraryType::kMKLDNN); +// } #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -389,7 +389,11 @@ REGISTER_OP_CPU_KERNEL( ops::SqueezeKernel, ops::SqueezeKernel, ops::SqueezeKernel, - ops::SqueezeKernel); + ops::SqueezeKernel, + ops::SqueezeKernel>, + ops::SqueezeKernel>); REGISTER_OP_CPU_KERNEL( squeeze_grad, ops::SqueezeGradKernel, @@ -398,7 +402,12 @@ REGISTER_OP_CPU_KERNEL( ops::SqueezeGradKernel, ops::SqueezeGradKernel, ops::SqueezeGradKernel, - ops::SqueezeGradKernel); + ops::SqueezeGradKernel, + ops::SqueezeGradKernel>, + ops::SqueezeGradKernel>); + REGISTER_OP_CPU_KERNEL( squeeze2, ops::Squeeze2Kernel, ops::Squeeze2Kernel, @@ -406,7 +415,12 @@ REGISTER_OP_CPU_KERNEL( ops::Squeeze2Kernel, ops::Squeeze2Kernel, ops::Squeeze2Kernel, - ops::Squeeze2Kernel); + ops::Squeeze2Kernel, + ops::Squeeze2Kernel>, + ops::Squeeze2Kernel>); + REGISTER_OP_CPU_KERNEL( squeeze2_grad, ops::Squeeze2GradKernel, @@ -415,4 +429,8 @@ REGISTER_OP_CPU_KERNEL( ops::Squeeze2GradKernel, ops::Squeeze2GradKernel, ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel); + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel>, + ops::Squeeze2GradKernel>); diff --git a/paddle/fluid/operators/squeeze_op.cu.cc b/paddle/fluid/operators/squeeze_op.cu.cc old mode 100755 new mode 100644 index 23431df12b681..9b4000c26ff6a --- a/paddle/fluid/operators/squeeze_op.cu.cc +++ b/paddle/fluid/operators/squeeze_op.cu.cc @@ -25,7 +25,11 @@ REGISTER_OP_CUDA_KERNEL( ops::SqueezeKernel, ops::SqueezeKernel, ops::SqueezeKernel, - ops::SqueezeKernel); + ops::SqueezeKernel, + ops::SqueezeKernel>, + ops::SqueezeKernel>); REGISTER_OP_CUDA_KERNEL( squeeze_grad, ops::SqueezeGradKernel, @@ -35,7 +39,11 @@ REGISTER_OP_CUDA_KERNEL( ops::SqueezeGradKernel, ops::SqueezeGradKernel, ops::SqueezeGradKernel, - ops::SqueezeGradKernel); + ops::SqueezeGradKernel, + ops::SqueezeGradKernel>, + ops::SqueezeGradKernel>); REGISTER_OP_CUDA_KERNEL( squeeze2, ops::Squeeze2Kernel, ops::Squeeze2Kernel, @@ -44,7 +52,11 @@ REGISTER_OP_CUDA_KERNEL( ops::Squeeze2Kernel, ops::Squeeze2Kernel, ops::Squeeze2Kernel, - ops::Squeeze2Kernel); + ops::Squeeze2Kernel, + ops::Squeeze2Kernel>, + ops::Squeeze2Kernel>); REGISTER_OP_CUDA_KERNEL( squeeze2_grad, ops::Squeeze2GradKernel, @@ -54,4 +66,8 @@ REGISTER_OP_CUDA_KERNEL( ops::Squeeze2GradKernel, ops::Squeeze2GradKernel, ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel); + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel>, + ops::Squeeze2GradKernel>); diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index 71d106c211f71..d592c62d499b3 100644 --- 
a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -289,10 +289,20 @@ struct DeviceIndependenceTensorOperations { framework::Tensor Div(const framework::Tensor& x, const framework::Tensor& y) { framework::Tensor ret; - std::vector out_shape = GetBroadcastShape({&x, &y}); - ret.Resize(framework::make_ddim(out_shape)); - ElementwiseComputeEx, DeviceContext, T>( - context, &x, &y, -1, DivFunctor(), &ret); + if (x.type() != y.type()) { + ret.mutable_data(x.dims(), context.GetPlace()); + auto x_vector = EigenVector::Flatten(x); + auto y_vector = EigenVector::Flatten(y); + auto out_vector = EigenVector::Flatten(ret); + auto& place = + *context.template device_context().eigen_device(); + out_vector.device(place) = x_vector / y_vector; + } else { + std::vector out_shape = GetBroadcastShape({&x, &y}); + ret.Resize(framework::make_ddim(out_shape)); + ElementwiseComputeEx, DeviceContext, T>( + context, &x, &y, -1, DivFunctor(), &ret); + } return ret; } framework::Tensor Add(const framework::Tensor& x, @@ -330,7 +340,8 @@ struct DeviceIndependenceTensorOperations { NameInTensorMap inputs({{"X", {&x}}}); return CreateOpRunAndReturnTensor("reduce_max", inputs, attrs, out_dim); } - + // Support float and complex type subtraction,the default is T type + template framework::Tensor Sub(const framework::Tensor& x, const framework::Tensor& y) { framework::Tensor ret; @@ -340,18 +351,18 @@ struct DeviceIndependenceTensorOperations { #if defined(__NVCC__) || defined(__HIPCC__) // For GPU, there is no need to define XxxInverseFunctor and call // ElementwiseComputeEx in two branches. - ElementwiseComputeEx, DeviceContext, T>( - context, &x, &y, -1, SubFunctor(), &ret); + ElementwiseComputeEx, DeviceContext, InT>( + context, &x, &y, -1, SubFunctor(), &ret); #endif } else { if (x.dims().size() >= y.dims().size()) { - ElementwiseComputeEx, DeviceContext, T>( - context, &x, &y, -1, SubFunctor(), &ret); + ElementwiseComputeEx, DeviceContext, InT>( + context, &x, &y, -1, SubFunctor(), &ret); } else { - ElementwiseComputeEx, DeviceContext, T>( - // This is copyed from elementwise_sub, which means we - // need reverse will xrank < yrank - context, &x, &y, -1, InverseSubFunctor(), &ret); + // This is copyed from elementwise_sub, which means we + // need reverse will xrank < yrank + ElementwiseComputeEx, DeviceContext, InT>( + context, &x, &y, -1, InverseSubFunctor(), &ret); } } return ret; @@ -461,37 +472,6 @@ struct DeviceIndependenceTensorOperations { return out; } - // Support x and y are different data types - Tensor Div_(const Tensor& x, const Tensor& y) { - Tensor out; - out.mutable_data(x.dims(), context.GetPlace()); - auto x_vector = EigenVector::Flatten(x); - auto y_vector = EigenVector::Flatten(y); - auto out_vector = EigenVector::Flatten(out); - auto& place = - *context.template device_context().eigen_device(); - out_vector.device(place) = x_vector / y_vector; - return out; - } - - framework::Tensor Sub_(const framework::Tensor& x, - const framework::Tensor& y) { - framework::Tensor ret; - std::vector out_shape = GetBroadcastShape({&x, &y}); - ret.Resize(framework::make_ddim(out_shape)); - if (x.dims().size() >= y.dims().size()) { - ElementwiseComputeEx, DeviceContext, ValueType>( - context, &x, &y, -1, SubFunctor(), &ret); - } else { - ElementwiseComputeEx, DeviceContext, - ValueType>( - // This is copyed from elementwise_sub, which means we - // need reverse will xrank < yrank - context, &x, &y, -1, InverseSubFunctor(), &ret); - } - return ret; - } - private: const 
framework::ExecutionContext& context; BlasT GetBlas() { diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 2b02d76d08e2d..35dcb25f6b784 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -350,7 +350,8 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad); REGISTER_OP_CPU_KERNEL( - transpose, ops::TransposeKernel, + transpose, ops::TransposeKernel, + ops::TransposeKernel, ops::TransposeKernel, ops::TransposeKernel>, @@ -358,6 +359,7 @@ REGISTER_OP_CPU_KERNEL( paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( transpose_grad, + ops::TransposeGradKernel, ops::TransposeGradKernel, ops::TransposeGradKernel, ops::TransposeGradKernel); REGISTER_OP_CPU_KERNEL( - transpose2, ops::TransposeKernel, + transpose2, ops::TransposeKernel, + ops::TransposeKernel, ops::TransposeKernel, ops::TransposeKernel, ops::TransposeKernel, @@ -383,6 +386,7 @@ REGISTER_OP_CPU_KERNEL( paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( transpose2_grad, + ops::TransposeGradKernel, ops::TransposeGradKernel, ops::TransposeGradKernel, ops::TransposeGradKernel, diff --git a/paddle/fluid/operators/transpose_op.cu b/paddle/fluid/operators/transpose_op.cu index 383fc6a5b9b32..5bcfbee5c53bc 100644 --- a/paddle/fluid/operators/transpose_op.cu +++ b/paddle/fluid/operators/transpose_op.cu @@ -83,6 +83,7 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( transpose, + ops::TransposeGPUKernel, ops::TransposeGPUKernel, ops::TransposeGPUKernel, ops::TransposeGPUKernel, @@ -92,6 +93,7 @@ REGISTER_OP_CUDA_KERNEL( paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( transpose_grad, + ops::TransposeGradGPUKernel, ops::TransposeGradGPUKernel, ops::TransposeGradGPUKernel, ops::TransposeGradGPUKernel, ops::TransposeGPUKernel, ops::TransposeGPUKernel, ops::TransposeGPUKernel, @@ -114,6 +117,7 @@ REGISTER_OP_CUDA_KERNEL( paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( transpose2_grad, + ops::TransposeGradGPUKernel, ops::TransposeGradGPUKernel, ops::TransposeGradGPUKernel, ops::TransposeGradGPUKernel, diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc old mode 100755 new mode 100644 index ed7a4f92f092b..77b06fb2d4b72 --- a/paddle/fluid/operators/unsqueeze_op.cc +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -362,7 +362,11 @@ REGISTER_OP_CPU_KERNEL( ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel, - ops::UnsqueezeKernel); + ops::UnsqueezeKernel, + ops::UnsqueezeKernel>, + ops::UnsqueezeKernel>); REGISTER_OP_CPU_KERNEL( unsqueeze_grad, ops::UnsqueezeGradKernel, @@ -371,7 +375,11 @@ REGISTER_OP_CPU_KERNEL( ops::UnsqueezeGradKernel, ops::UnsqueezeGradKernel, ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel); + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel>, + ops::UnsqueezeGradKernel>); REGISTER_OP_CPU_KERNEL( unsqueeze2, ops::UnsqueezeKernel, ops::UnsqueezeKernel, @@ -379,7 +387,11 @@ REGISTER_OP_CPU_KERNEL( ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel, - ops::UnsqueezeKernel); + ops::UnsqueezeKernel, + ops::UnsqueezeKernel>, + ops::UnsqueezeKernel>); REGISTER_OP_CPU_KERNEL( unsqueeze2_grad, ops::Unsqueeze2GradKernel, @@ -388,4 +400,8 @@ REGISTER_OP_CPU_KERNEL( ops::Unsqueeze2GradKernel, ops::Unsqueeze2GradKernel, ops::Unsqueeze2GradKernel, - ops::Unsqueeze2GradKernel); + ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel>, + ops::Unsqueeze2GradKernel>); diff --git a/paddle/fluid/operators/unsqueeze_op.cu.cc 
b/paddle/fluid/operators/unsqueeze_op.cu.cc old mode 100755 new mode 100644 index 2781b3ef8c838..d1fe251ef7706 --- a/paddle/fluid/operators/unsqueeze_op.cu.cc +++ b/paddle/fluid/operators/unsqueeze_op.cu.cc @@ -25,7 +25,11 @@ REGISTER_OP_CUDA_KERNEL( ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel, - ops::UnsqueezeKernel); + ops::UnsqueezeKernel, + ops::UnsqueezeKernel>, + ops::UnsqueezeKernel>); REGISTER_OP_CUDA_KERNEL( unsqueeze_grad, ops::UnsqueezeGradKernel, @@ -36,7 +40,11 @@ REGISTER_OP_CUDA_KERNEL( ops::UnsqueezeGradKernel, ops::UnsqueezeGradKernel, ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel); + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel>, + ops::UnsqueezeGradKernel>); REGISTER_OP_CUDA_KERNEL( unsqueeze2, ops::UnsqueezeKernel, @@ -46,7 +54,11 @@ REGISTER_OP_CUDA_KERNEL( ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel, - ops::UnsqueezeKernel); + ops::UnsqueezeKernel, + ops::UnsqueezeKernel>, + ops::UnsqueezeKernel>); REGISTER_OP_CUDA_KERNEL( unsqueeze2_grad, ops::Unsqueeze2GradKernel, @@ -57,4 +69,8 @@ REGISTER_OP_CUDA_KERNEL( ops::Unsqueeze2GradKernel, ops::Unsqueeze2GradKernel, ops::Unsqueeze2GradKernel, - ops::Unsqueeze2GradKernel); + ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel>, + ops::Unsqueeze2GradKernel>); diff --git a/paddle/fluid/operators/utils.h b/paddle/fluid/operators/utils.h index 912d538d5e951..770369e64f46f 100644 --- a/paddle/fluid/operators/utils.h +++ b/paddle/fluid/operators/utils.h @@ -114,6 +114,11 @@ inline T GetValue(const framework::Tensor* x) { if (!platform::is_cpu_place(x->place())) { framework::Tensor cpu_x; framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x); +#ifdef PADDLE_WITH_ASCEND_CL + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx = pool.Get(x->place()); + dev_ctx->Wait(); +#endif value = cpu_x.data()[0]; } else { value = x->data()[0]; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index d99f991911e9c..2540170ed54fb 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -37,13 +37,13 @@ if (WITH_PYTHON) endif(NOT WIN32) endif() -cc_library(flags SRCS flags.cc DEPS gflags) +cc_library(flags SRCS flags.cc DEPS gflags boost) cc_library(denormal SRCS denormal.cc DEPS) cc_library(errors SRCS errors.cc DEPS error_codes_proto) cc_test(errors_test SRCS errors_test.cc DEPS errors enforce) -set(enforce_deps flags errors boost) +set(enforce_deps flags errors boost flags) if(WITH_GPU) set(enforce_deps ${enforce_deps} external_error_proto) endif() @@ -60,6 +60,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) IF(WITH_GPU) nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda) + nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce) ENDIF() IF(WITH_ROCM) hip_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda) diff --git a/paddle/fluid/platform/complex.h b/paddle/fluid/platform/complex.h index 2c1b42ea4882d..065ccd375c94c 100644 --- a/paddle/fluid/platform/complex.h +++ b/paddle/fluid/platform/complex.h @@ -60,6 +60,8 @@ struct PADDLE_ALIGN(sizeof(T) * 2) complex { T real; T imag; + using value_type = T; + complex() = default; complex(const complex& o) = default; complex& operator=(const complex& o) = default; diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 6405b55621766..e486044486571 100644 --- 
a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -31,7 +31,7 @@ limitations under the License. */ #endif // _WIN32 #include -#include "gflags/gflags.h" +#include "paddle/fluid/platform/flags.h" DECLARE_double(fraction_of_cpu_memory_to_use); DECLARE_uint64(initial_cpu_memory_in_mb); @@ -42,7 +42,8 @@ DECLARE_double(fraction_of_cuda_pinned_memory_to_use); // between host and device. Allocates too much would reduce the amount // of memory available to the system for paging. So, by default, we // should set false to use_pinned_memory. -DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory."); +PADDLE_DEFINE_EXPORTED_bool(use_pinned_memory, true, + "If set, allocate cpu pinned memory."); namespace paddle { namespace platform { @@ -54,7 +55,9 @@ size_t CpuTotalPhysicalMemory() { mib[1] = HW_MEMSIZE; int64_t size = 0; size_t len = sizeof(size); - if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size; + if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) { + return static_cast(size); + } return 0L; #elif defined(_WIN32) MEMORYSTATUSEX sMeminfo; diff --git a/paddle/fluid/platform/cuda_profiler.cc b/paddle/fluid/platform/cuda_profiler.cc new file mode 100644 index 0000000000000..998dd80dc5e7d --- /dev/null +++ b/paddle/fluid/platform/cuda_profiler.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/cuda_profiler.h" + +namespace paddle { +namespace platform { + +void CudaProfilerInit(std::string output_file, std::string output_mode, + std::string config_file) { + PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv", + platform::errors::InvalidArgument( + "Unsupported cuda profiler output mode, expect `kvp` or " + "`csv`, but received `%s`.", + output_mode)); + cudaOutputMode_t mode = output_mode == "csv" ? 
cudaCSV : cudaKeyValuePair; + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode)); +} + +void CudaProfilerStart() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStart()); } + +void CudaProfilerStop() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStop()); } + +#ifndef _WIN32 +void CudaNvtxRangePush(std::string name) { + dynload::nvtxRangePushA(name.c_str()); +} + +void CudaNvtxRangePop() { dynload::nvtxRangePop(); } +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cuda_profiler.h b/paddle/fluid/platform/cuda_profiler.h index 6edc141205a95..5780b877d1a45 100644 --- a/paddle/fluid/platform/cuda_profiler.h +++ b/paddle/fluid/platform/cuda_profiler.h @@ -24,27 +24,16 @@ namespace paddle { namespace platform { void CudaProfilerInit(std::string output_file, std::string output_mode, - std::string config_file) { - PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv", - platform::errors::InvalidArgument( - "Unsupported cuda profiler output mode, expect `kvp` or " - "`csv`, but received `%s`.", - output_mode)); - cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair; - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode)); -} + std::string config_file); -void CudaProfilerStart() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStart()); } +void CudaProfilerStart(); -void CudaProfilerStop() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStop()); } +void CudaProfilerStop(); #ifndef _WIN32 -void CudaNvtxRangePush(std::string name) { - dynload::nvtxRangePushA(name.c_str()); -} +void CudaNvtxRangePush(std::string name); -void CudaNvtxRangePop() { dynload::nvtxRangePop(); } +void CudaNvtxRangePop(); #endif } // namespace platform diff --git a/paddle/fluid/platform/cudnn_desc.h b/paddle/fluid/platform/cudnn_desc.h index 486b3346c3760..318c85ee484be 100644 --- a/paddle/fluid/platform/cudnn_desc.h +++ b/paddle/fluid/platform/cudnn_desc.h @@ -44,6 +44,9 @@ inline cudnnDataType_t ToCudnnDataType(const T& t) { inline std::vector TransformDimOrder(const std::vector& dims) { std::vector transformed_dims(dims.begin(), dims.end()); + if (dims.size() < 4) { + return transformed_dims; + } int H, W, D, C; if (dims.size() == 4) { H = dims[1]; @@ -155,8 +158,8 @@ class TensorDescriptor { dims_with_group.data(), strides.data())); } - void set(const Tensor& tensor, const cudnnTensorFormat_t format) { - auto dims = framework::vectorize(tensor.dims()); + void set(const std::vector& dims, const cudnnTensorFormat_t format, + const cudnnDataType_t dtype) { std::vector transformed_dims; if (format == CUDNN_TENSOR_NHWC) { transformed_dims = TransformDimOrder(dims); @@ -164,8 +167,14 @@ class TensorDescriptor { transformed_dims = dims; } PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptorEx( - desc_.get(), format, ToCudnnDataType(tensor.type()), - transformed_dims.size(), transformed_dims.data())); + desc_.get(), format, dtype, transformed_dims.size(), + transformed_dims.data())); + } + + void set(const Tensor& tensor, const cudnnTensorFormat_t format) { + auto dims = framework::vectorize(tensor.dims()); + auto dtype = ToCudnnDataType(tensor.type()); + set(dims, format, dtype); } private: @@ -191,9 +200,8 @@ class FilterDescriptor { T* desc() { return desc_.get(); } T* desc() const { return desc_.get(); } - void set(const Tensor& tensor, const cudnnTensorFormat_t format, - const int groups = 1) { - auto dims = framework::vectorize(tensor.dims()); + void set(const 
std::vector& dims, const cudnnTensorFormat_t format, + const cudnnDataType_t dtype, const int groups = 1) { std::vector transformed_dims; if (format == CUDNN_TENSOR_NHWC) { transformed_dims = TransformDimOrder(dims); @@ -204,8 +212,15 @@ class FilterDescriptor { transformed_dims[1] = transformed_dims[1] / groups; } PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetFilterNdDescriptor( - desc_.get(), ToCudnnDataType(tensor.type()), format, - transformed_dims.size(), transformed_dims.data())); + desc_.get(), dtype, format, transformed_dims.size(), + transformed_dims.data())); + } + + void set(const Tensor& tensor, const cudnnTensorFormat_t format, + const int groups = 1) { + auto dims = framework::vectorize(tensor.dims()); + auto dtype = ToCudnnDataType(tensor.type()); + set(dims, format, dtype, groups); } private: diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 1bd46c0bfafaa..8160a06ddea5d 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -494,6 +494,16 @@ class DeviceTracerImpl : public DeviceTracer { } proto::Profile GenProfile(const std::string &profile_path) { + proto::Profile profile_pb = this->GetProfile(); + std::ofstream profile_f; + profile_f.open(profile_path, + std::ios::out | std::ios::trunc | std::ios::binary); + profile_pb.SerializeToOstream(&profile_f); + profile_f.close(); + return profile_pb; + } + + proto::Profile GetProfile() { int miss = 0, find = 0; std::lock_guard l(trace_mu_); proto::Profile profile_pb; @@ -601,12 +611,6 @@ class DeviceTracerImpl : public DeviceTracer { event->set_thread_id(r.thread_id); } } - - std::ofstream profile_f; - profile_f.open(profile_path, - std::ios::out | std::ios::trunc | std::ios::binary); - profile_pb.SerializeToOstream(&profile_f); - profile_f.close(); return profile_pb; } diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index 9bae7a870522c..ef06d0d609e1a 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -126,6 +126,9 @@ class DeviceTracer { int64_t device_id, int64_t stream_id, uint32_t correlation_id) = 0; + // Get a proto after done + virtual proto::Profile GetProfile() = 0; + // Generate a proto after done (Disabled). virtual proto::Profile GenProfile(const std::string& profile_path) = 0; diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index ac98ff02035bd..c0d4b349a9e09 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) -list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc) +list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc) if (NOT WITH_NV_JETSON) list(APPEND CUDA_SRCS nvjpeg.cc) @@ -45,4 +45,7 @@ endif() if (WITH_MKLML) cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) endif() + +cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader) +add_dependencies(dynload_lapack extern_lapack) # TODO(TJ): add iomp, mkldnn? 
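(Aside: the dynload_lapack target added above follows the same lazy-loading scheme as the other dynload wrappers and as the lapack.h/cufft.h headers introduced further down: open the shared library once under std::call_once, then resolve each symbol with dlsym on first use. The stand-alone sketch below is illustrative only; the library name and helper names are assumptions, not Paddle's actual macros or loader.)

// Sketch of the dynload wrapper pattern, not part of the patch.
// Lazily dlopen the library once, then cache each resolved symbol.
#include <dlfcn.h>
#include <mutex>
#include <stdexcept>

static std::once_flag lapack_once;
static void* lapack_handle = nullptr;

static void* GetLapackSymbol(const char* name) {
  std::call_once(lapack_once, [] {
    // Library name is an assumption for this sketch; Paddle resolves it via
    // GetLAPACKDsoHandle() and FLAGS_lapack_dir instead.
    lapack_handle = dlopen("liblapack.so.3", RTLD_LAZY | RTLD_LOCAL);
  });
  if (lapack_handle == nullptr) {
    throw std::runtime_error("cannot load the LAPACK shared library");
  }
  return dlsym(lapack_handle, name);
}

// One hand-written declaration (LAPACK ships no C header), wrapped so the
// resolved function pointer is cached after the first call.
extern "C" void dgetrf_(int* m, int* n, double* a, int* lda, int* ipiv,
                        int* info);

inline void LapackDgetrf(int* m, int* n, double* a, int* lda, int* ipiv,
                         int* info) {
  using Fn = decltype(&dgetrf_);
  static Fn fn = reinterpret_cast<Fn>(GetLapackSymbol("dgetrf_"));
  fn(m, n, a, lda, ipiv, info);
}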
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 96e16894c78c6..ab30ab307a9c7 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -89,7 +89,9 @@ extern void *cublas_dso_handle; __macro(cublasDgetrfBatched); \ __macro(cublasDgetriBatched); \ __macro(cublasSmatinvBatched); \ - __macro(cublasDmatinvBatched); + __macro(cublasDmatinvBatched); \ + __macro(cublasSgetrsBatched); \ + __macro(cublasDgetrsBatched); CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 4828a97e4df4d..3420c38fe9639 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -180,7 +180,18 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif #if CUDNN_VERSION >= 8000 -#define CUDNN_DNN_ROUTINE_EACH_R8(__macro) __macro(cudnnSetRNNDescriptor_v8); +#define CUDNN_DNN_ROUTINE_EACH_R8(__macro) \ + __macro(cudnnSetRNNDescriptor_v8); \ + __macro(cudnnCreateFusedOpsPlan); \ + __macro(cudnnCreateFusedOpsConstParamPack); \ + __macro(cudnnCreateFusedOpsVariantParamPack); \ + __macro(cudnnDestroyFusedOpsPlan); \ + __macro(cudnnDestroyFusedOpsConstParamPack); \ + __macro(cudnnDestroyFusedOpsVariantParamPack); \ + __macro(cudnnFusedOpsExecute); \ + __macro(cudnnSetFusedOpsConstParamPackAttribute); \ + __macro(cudnnSetFusedOpsVariantParamPackAttribute); \ + __macro(cudnnMakeFusedOpsPlan); CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif diff --git a/paddle/fluid/platform/dynload/cufft.cc b/paddle/fluid/platform/dynload/cufft.cc new file mode 100644 index 0000000000000..a125fb7226050 --- /dev/null +++ b/paddle/fluid/platform/dynload/cufft.cc @@ -0,0 +1,44 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/cufft.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace dynload { +std::once_flag cufft_dso_flag; +void* cufft_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP); + +bool HasCUFFT() { + std::call_once(cufft_dso_flag, + []() { cufft_dso_handle = GetCUFFTDsoHandle(); }); + return cufft_dso_handle != nullptr; +} + +void EnforceCUFFTLoaded(const char* fn_name) { + PADDLE_ENFORCE_NOT_NULL( + cufft_dso_handle, + platform::errors::PreconditionNotMet( + "Cannot load cufft shared library. Cannot invoke method %s.", + fn_name)); +} + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/cufft.h b/paddle/fluid/platform/dynload/cufft.h new file mode 100644 index 0000000000000..ef924d7b5ee86 --- /dev/null +++ b/paddle/fluid/platform/dynload/cufft.h @@ -0,0 +1,113 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#ifdef PADDLE_WITH_CUDA +#include +#include +#include +#include // NOLINT + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag cufft_dso_flag; +extern void* cufft_dso_handle; +extern bool HasCUFFT(); + +extern void EnforceCUFFTLoaded(const char* fn_name); +#define DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using cufft_func = decltype(&::__name); \ + std::call_once(cufft_dso_flag, []() { \ + cufft_dso_handle = paddle::platform::dynload::GetCUFFTDsoHandle(); \ + }); \ + EnforceCUFFTLoaded(#__name); \ + static void* p_##__name = dlsym(cufft_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +/** + * include all needed cufft functions in HPPL + * different cufft version has different interfaces + **/ +#define CUFFT_FFT_ROUTINE_EACH(__macro) \ + __macro(cufftPlan1d); \ + __macro(cufftPlan2d); \ + __macro(cufftPlan3d); \ + __macro(cufftPlanMany); \ + __macro(cufftMakePlan1d); \ + __macro(cufftMakePlan2d); \ + __macro(cufftMakePlan3d); \ + __macro(cufftMakePlanMany); \ + __macro(cufftMakePlanMany64); \ + __macro(cufftGetSizeMany64); \ + __macro(cufftEstimate1d); \ + __macro(cufftEstimate2d); \ + __macro(cufftEstimate3d); \ + __macro(cufftEstimateMany); \ + __macro(cufftCreate); \ + __macro(cufftGetSize1d); \ + __macro(cufftGetSize2d); \ + __macro(cufftGetSize3d); \ + __macro(cufftGetSizeMany); \ + __macro(cufftGetSize); \ + __macro(cufftSetWorkArea); \ + __macro(cufftSetAutoAllocation); \ + __macro(cufftExecC2C); \ + __macro(cufftExecR2C); \ + __macro(cufftExecC2R); \ + __macro(cufftExecZ2Z); \ + __macro(cufftExecD2Z); \ + __macro(cufftExecZ2D); \ + __macro(cufftSetStream); \ + __macro(cufftDestroy); \ + __macro(cufftGetVersion); \ + __macro(cufftGetProperty); \ + __macro(cufftXtSetGPUs); \ + __macro(cufftXtMalloc); \ + __macro(cufftXtMemcpy); \ + __macro(cufftXtFree); \ + __macro(cufftXtSetWorkArea); \ + __macro(cufftXtExecDescriptorC2C); \ + __macro(cufftXtExecDescriptorR2C); \ + __macro(cufftXtExecDescriptorC2R); \ + __macro(cufftXtExecDescriptorZ2Z); \ + __macro(cufftXtExecDescriptorD2Z); \ + __macro(cufftXtExecDescriptorZ2D); \ + __macro(cufftXtQueryPlan); \ + __macro(cufftXtSetCallback); \ + __macro(cufftXtClearCallback); \ + __macro(cufftXtSetCallbackSharedSize); \ + __macro(cufftXtMakePlanMany); \ + __macro(cufftXtGetSizeMany); \ + __macro(cufftXtExec); \ + __macro(cufftXtExecDescriptor); \ + __macro(cufftXtSetWorkAreaPolicy); + +CUFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUFFT_WRAP) + +} // namespace dynload +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/dynload/cusparse.h b/paddle/fluid/platform/dynload/cusparse.h index 3f41f6978d059..98841949676e4 100644 --- 
a/paddle/fluid/platform/dynload/cusparse.h +++ b/paddle/fluid/platform/dynload/cusparse.h @@ -41,6 +41,7 @@ extern void *cusparse_dso_handle; }; \ extern DynLoad__##__name __name +#ifndef _WIN32 #if CUDA_VERSION >= 11020 #define CUSPARSE_ROUTINE_EACH(__macro) \ __macro(cusparseCreate); \ @@ -57,6 +58,7 @@ extern void *cusparse_dso_handle; CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP); #endif +#endif #undef DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP } // namespace dynload diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 37932600e7a7e..a83f085f7d2d8 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -51,6 +51,8 @@ DEFINE_string( DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); +DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so."); + DEFINE_string(op_dir, "", "Specify path for loading user-defined op library."); #ifdef PADDLE_WITH_HIP @@ -109,6 +111,9 @@ static constexpr char* win_cusolver_lib = static constexpr char* win_cusparse_lib = "cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR ".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll;cusparse64_10.dll"; +static constexpr char* win_cufft_lib = + "cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;cufft64_" CUDA_VERSION_MAJOR ".dll;cufft64_10.dll"; #else static constexpr char* win_curand_lib = "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR @@ -122,6 +127,9 @@ static constexpr char* win_cusolver_lib = static constexpr char* win_cusparse_lib = "cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR ".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll"; +static constexpr char* win_cufft_lib = + "cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;cufft64_" CUDA_VERSION_MAJOR ".dll"; #endif // CUDA_VERSION #endif @@ -472,6 +480,16 @@ void* GetMKLMLDsoHandle() { #endif } +void* GetLAPACKDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.3.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.dll"); +#else + return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.so.3"); +#endif +} + void* GetOpDsoHandle(const std::string& dso_name) { return GetDsoHandleFromSearchPath(FLAGS_op_dir, dso_name); } @@ -489,6 +507,17 @@ void* GetNvtxDsoHandle() { #endif } +void* GetCUFFTDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.dylib"); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cufft_lib, true, + {cuda_lib_path}); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so"); +#endif +} + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index e282c033c4451..82c36d9e224f4 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -39,8 +39,10 @@ void* GetNCCLDsoHandle(); void* GetHCCLDsoHandle(); void* GetTensorRtDsoHandle(); void* GetMKLMLDsoHandle(); +void* GetLAPACKDsoHandle(); void* GetOpDsoHandle(const std::string& dso_name); void* GetNvtxDsoHandle(); +void* GetCUFFTDsoHandle(); void SetPaddleLibPath(const std::string&); } // namespace dynload diff --git a/paddle/fluid/platform/dynload/lapack.cc 
b/paddle/fluid/platform/dynload/lapack.cc
new file mode 100644
index 0000000000000..eeebe240874f2
--- /dev/null
+++ b/paddle/fluid/platform/dynload/lapack.cc
@@ -0,0 +1,31 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/dynload/lapack.h"
+#include <mutex>
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+std::once_flag lapack_dso_flag;
+void* lapack_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+LAPACK_ROUTINE_EACH(DEFINE_WRAP);
+
+} // namespace dynload
+} // namespace platform
+} // namespace paddle
diff --git a/paddle/fluid/platform/dynload/lapack.h b/paddle/fluid/platform/dynload/lapack.h
new file mode 100644
index 0000000000000..9b4dd3d9e3ce5
--- /dev/null
+++ b/paddle/fluid/platform/dynload/lapack.h
@@ -0,0 +1,116 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <complex>
+#include <mutex>
+#include "paddle/fluid/platform/complex.h"
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/port.h"
+
+// Note(zhouwei): because lapack doesn't provide appropriate header file.
+// should expose API statement yourself.
+ +// getrf_(For example) +extern "C" void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, + int *info); +extern "C" void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, + int *info); + +// evd +extern "C" void zheevd_(char *jobz, char *uplo, int *n, std::complex *a, + int *lda, double *w, std::complex *work, + int *lwork, double *rwork, int *lrwork, int *iwork, + int *liwork, int *info); +extern "C" void cheevd_(char *jobz, char *uplo, int *n, std::complex *a, + int *lda, float *w, std::complex *work, + int *lwork, float *rwork, int *lrwork, int *iwork, + int *liwork, int *info); +extern "C" void dsyevd_(char *jobz, char *uplo, int *n, double *a, int *lda, + double *w, double *work, int *lwork, int *iwork, + int *liwork, int *info); +extern "C" void ssyevd_(char *jobz, char *uplo, int *n, float *a, int *lda, + float *w, float *work, int *lwork, int *iwork, + int *liwork, int *info); + +// geev +extern "C" void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, + double *wr, double *wi, double *vl, int *ldvl, + double *vr, int *ldvr, double *work, int *lwork, + int *info); +extern "C" void sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, + float *wr, float *wi, float *vl, int *ldvl, float *vr, + int *ldvr, float *work, int *lwork, int *info); +extern "C" void zgeev_(char *jobvl, char *jobvr, int *n, + std::complex *a, int *lda, + std::complex *w, std::complex *vl, + int *ldvl, std::complex *vr, int *ldvr, + std::complex *work, int *lwork, double *rwork, + int *info); +extern "C" void cgeev_(char *jobvl, char *jobvr, int *n, std::complex *a, + int *lda, std::complex *w, + std::complex *vl, int *ldvl, + std::complex *vr, int *ldvr, + std::complex *work, int *lwork, float *rwork, + int *info); + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag lapack_dso_flag; +extern void *lapack_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load lapack routine + * via operator overloading. + */ +#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using lapackFunc = decltype(&::__name); \ + std::call_once(lapack_dso_flag, []() { \ + lapack_dso_handle = paddle::platform::dynload::GetLAPACKDsoHandle(); \ + }); \ + static void *p_##_name = dlsym(lapack_dso_handle, #__name); \ + return reinterpret_cast(p_##_name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define DECLARE_DYNAMIC_LOAD_LAPACK_WRAP(__name) \ + DYNAMIC_LOAD_LAPACK_WRAP(__name) + +#define LAPACK_ROUTINE_EACH(__macro) \ + __macro(dgetrf_); \ + __macro(sgetrf_); \ + __macro(zheevd_); \ + __macro(cheevd_); \ + __macro(dsyevd_); \ + __macro(ssyevd_); \ + __macro(dgeev_); \ + __macro(sgeev_); \ + __macro(zgeev_); \ + __macro(cgeev_); + +LAPACK_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_LAPACK_WRAP); + +#undef DYNAMIC_LOAD_LAPACK_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 52be0c805bbd2..c420a5a64be06 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -101,6 +101,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/type_defs.h" #endif +#include "paddle/fluid/platform/flags.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index ed465c9ea2eb8..b97c3106439be 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -12,11 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "gflags/gflags.h" +#include "paddle/fluid/platform/flags.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cudnn_workspace_helper.h" #endif +namespace paddle { +namespace platform { + +const ExportedFlagInfoMap &GetExportedFlagInfoMap() { + return *GetMutableExportedFlagInfoMap(); +} + +ExportedFlagInfoMap *GetMutableExportedFlagInfoMap() { + static ExportedFlagInfoMap g_exported_flag_info_map; + return &g_exported_flag_info_map; +} + +} // namespace platform +} // namespace paddle + /** * NOTE(paddle-dev): This file is designed to define all public FLAGS. */ @@ -30,8 +45,8 @@ * instance to 2 * Note: */ -DEFINE_int32(paddle_num_threads, 1, - "Number of threads for each paddle instance."); +PADDLE_DEFINE_EXPORTED_int32(paddle_num_threads, 1, + "Number of threads for each paddle instance."); /** * Operator related FLAG @@ -41,9 +56,10 @@ DEFINE_int32(paddle_num_threads, 1, * Example: * Note: Used to debug. Checking whether operator produce NAN/INF or not. */ -DEFINE_bool(check_nan_inf, false, - "Checking whether operator produce NAN/INF or not. It will be " - "extremely slow so please use this flag wisely."); +PADDLE_DEFINE_EXPORTED_bool( + check_nan_inf, false, + "Checking whether operator produce NAN/INF or not. It will be " + "extremely slow so please use this flag wisely."); // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. @@ -58,7 +74,7 @@ DEFINE_bool(check_nan_inf, false, * Example: * Note: whether to use Tensor Core, faster but it may loss precision. */ -DEFINE_bool( +PADDLE_DEFINE_EXPORTED_bool( enable_cublas_tensor_op_math, false, "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, " "but it may loss precision. Currently, There are two CUDA libraries that" @@ -77,30 +93,34 @@ DEFINE_bool( * cards * Note: A list of device ids separated by comma, like: 0,1,2,3 */ -DEFINE_string(selected_gpus, "", - "A list of device ids separated by comma, like: 0,1,2,3. " - "This option is useful when doing multi process training and " - "each process have only one device (GPU). If you want to use " - "all visible devices, set this to empty string. NOTE: the " - "reason of doing this is that we want to use P2P communication" - "between GPU devices, use CUDA_VISIBLE_DEVICES can only use" - "share-memory only."); +PADDLE_DEFINE_EXPORTED_string( + selected_gpus, "", + "A list of device ids separated by comma, like: 0,1,2,3. " + "This option is useful when doing multi process training and " + "each process have only one device (GPU). If you want to use " + "all visible devices, set this to empty string. NOTE: the " + "reason of doing this is that we want to use P2P communication" + "between GPU devices, use CUDA_VISIBLE_DEVICES can only use" + "share-memory only."); #endif #if defined(PADDLE_WITH_ASCEND_CL) -DEFINE_string(selected_npus, "", - "A list of device ids separated by comma, like: 0,1,2,3. 
" - "This option is useful when doing multi process training and " - "each process have only one device (NPU). If you want to use " - "all visible devices, set this to empty string."); -DEFINE_bool(hccl_check_nan, true, - "Check Nan in tensor before hccl_allreduce_sum otherwise it'll " - "core when meets Nan value"); -DEFINE_string( +PADDLE_DEFINE_EXPORTED_string( + selected_npus, "", + "A list of device ids separated by comma, like: 0,1,2,3. " + "This option is useful when doing multi process training and " + "each process have only one device (NPU). If you want to use " + "all visible devices, set this to empty string."); +PADDLE_DEFINE_EXPORTED_bool( + hccl_check_nan, true, + "Check Nan in tensor before hccl_allreduce_sum otherwise it'll " + "core when meets Nan value"); +PADDLE_DEFINE_EXPORTED_string( npu_config_path, "", "The absolute path of configuration json file, like: /tmp/config.json. " "If proveided, it will be passed to aclInit()."); -DEFINE_int32(min_loss_scaling, 1, "set minmum loss scaling value!"); +PADDLE_DEFINE_EXPORTED_int32(min_loss_scaling, 1, + "set minmum loss scaling value!"); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -113,10 +133,11 @@ DEFINE_int32(min_loss_scaling, 1, "set minmum loss scaling value!"); * Note: whether to use deterministic algorithm in cudnn. * If true, it will slow down some operators such as conv and pooling. */ -DEFINE_bool(cudnn_deterministic, false, - "Whether allow using an autotuning algorithm for convolution " - "operator. The autotuning algorithm may be non-deterministic. If " - "true, the algorithm is deterministic."); +PADDLE_DEFINE_EXPORTED_bool( + cudnn_deterministic, false, + "Whether allow using an autotuning algorithm for convolution " + "operator. The autotuning algorithm may be non-deterministic. If " + "true, the algorithm is deterministic."); /** * CUDNN related FLAG @@ -130,9 +151,10 @@ DEFINE_bool(cudnn_deterministic, false, * increased. * Users need to balance memory and speed. */ -DEFINE_uint64(conv_workspace_size_limit, - paddle::platform::kDefaultConvWorkspaceSizeLimitMB, - "cuDNN convolution workspace limit in MB unit."); +PADDLE_DEFINE_EXPORTED_uint64( + conv_workspace_size_limit, + paddle::platform::kDefaultConvWorkspaceSizeLimitMB, + "cuDNN convolution workspace limit in MB unit."); /** * CUDNN related FLAG @@ -148,9 +170,10 @@ DEFINE_uint64(conv_workspace_size_limit, * layer specification. Once you change the layer specifications * (such as batch size, feature map size), it will search again. */ -DEFINE_bool(cudnn_exhaustive_search, false, - "Whether enable exhaustive search for cuDNN convolution or " - "not, default is False."); +PADDLE_DEFINE_EXPORTED_bool( + cudnn_exhaustive_search, false, + "Whether enable exhaustive search for cuDNN convolution or " + "not, default is False."); /** * CUDNN related FLAG @@ -160,9 +183,9 @@ DEFINE_bool(cudnn_exhaustive_search, false, * Example: * Note: only used to predict for advanced developer */ -DEFINE_int64(cudnn_exhaustive_search_times, -1, - "Exhaustive search times for cuDNN convolution, " - "default is -1, not exhaustive search"); +PADDLE_DEFINE_EXPORTED_int64(cudnn_exhaustive_search_times, -1, + "Exhaustive search times for cuDNN convolution, " + "default is -1, not exhaustive search"); /** * CUDNN related FLAG @@ -180,9 +203,10 @@ DEFINE_int64(cudnn_exhaustive_search_times, -1, * certain * input data range. 
*/ -DEFINE_bool(cudnn_batchnorm_spatial_persistent, false, - "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn " - "batch_norm, default is False."); +PADDLE_DEFINE_EXPORTED_bool( + cudnn_batchnorm_spatial_persistent, false, + "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn " + "batch_norm, default is False."); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -197,7 +221,7 @@ DEFINE_bool(cudnn_batchnorm_spatial_persistent, false, * https://github.com/PaddlePaddle/Paddle/issues/15049 * If you want to change this default value, why?(gongwb) */ -DEFINE_bool( +PADDLE_DEFINE_EXPORTED_bool( sync_nccl_allreduce, true, "If set true, will call `cudaStreamSynchronize(nccl_stream)`" "after allreduce, this mode can get better performance in some scenarios."); @@ -215,11 +239,12 @@ DEFINE_bool( * into the queue, and then the communicator takes the gradients out * of the queue and sends them after merging. */ -DEFINE_int32(communicator_max_merge_var_num, 20, - "max var num to merge and send"); -DEFINE_bool(communicator_is_sgd_optimizer, true, - "gradient sent to the server is the sum of the gradients " - "calculated by each thread if optimizer is sgd"); +PADDLE_DEFINE_EXPORTED_int32(communicator_max_merge_var_num, 20, + "max var num to merge and send"); +PADDLE_DEFINE_EXPORTED_bool( + communicator_is_sgd_optimizer, true, + "gradient sent to the server is the sum of the gradients " + "calculated by each thread if optimizer is sgd"); /** * Distributed related FLAG * Name: FLAGS_communicator_send_queue_size @@ -233,8 +258,8 @@ DEFINE_bool(communicator_is_sgd_optimizer, true, * space. It is used to avoid training much faster than communication, * so that too many gradients are not sent out in time. */ -DEFINE_int32(communicator_send_queue_size, 20, - "queue size to recv gradient before send"); +PADDLE_DEFINE_EXPORTED_int32(communicator_send_queue_size, 20, + "queue size to recv gradient before send"); #endif /** @@ -246,8 +271,9 @@ DEFINE_int32(communicator_send_queue_size, 20, * Note: Control the number of threads used for distributed modules. * If it is not set, it is set to a hard thread. */ -DEFINE_int32(dist_threadpool_size, 0, - "number of threads used for distributed executed."); +PADDLE_DEFINE_EXPORTED_int32( + dist_threadpool_size, 0, + "number of threads used for distributed executed."); /** * Garbage collector related FLAG @@ -272,7 +298,7 @@ static const double kDefaultEagerDeleteTensorGB = -1; static const double kDefaultEagerDeleteTensorGB = 0; #endif -DEFINE_double( +PADDLE_DEFINE_EXPORTED_double( eager_delete_tensor_gb, kDefaultEagerDeleteTensorGB, "Memory size threshold (GB) when the garbage collector clear tensors." "Disabled when this value is less than 0"); @@ -289,9 +315,10 @@ DEFINE_double( * has finished, which will make the garbage collection strategy faster. * Only works when garbage collection strategy is enabled. */ -DEFINE_bool(fast_eager_deletion_mode, true, - "Fast eager deletion mode. If enabled, memory would release " - "immediately without waiting GPU kernel ends."); +PADDLE_DEFINE_EXPORTED_bool( + fast_eager_deletion_mode, true, + "Fast eager deletion mode. If enabled, memory would release " + "immediately without waiting GPU kernel ends."); /** * Memory related FLAG @@ -311,11 +338,12 @@ DEFINE_bool(fast_eager_deletion_mode, true, * largest FLAGS_memory_fraction_of_eager_deletion ratio will be released. * The flag is only valid when running parallel data compilers. 
*/ -DEFINE_double(memory_fraction_of_eager_deletion, 1.0, - "Fraction of eager deletion. If less than 1.0, all variables in " - "the program would be sorted according to its memory size, and " - "only the FLAGS_memory_fraction_of_eager_deletion of the largest " - "variables would be deleted."); +PADDLE_DEFINE_EXPORTED_double( + memory_fraction_of_eager_deletion, 1.0, + "Fraction of eager deletion. If less than 1.0, all variables in " + "the program would be sorted according to its memory size, and " + "only the FLAGS_memory_fraction_of_eager_deletion of the largest " + "variables would be deleted."); /** * Allocator related FLAG @@ -331,7 +359,7 @@ static constexpr char kDefaultAllocatorStrategy[] = "naive_best_fit"; #else static constexpr char kDefaultAllocatorStrategy[] = "auto_growth"; #endif -DEFINE_string( +PADDLE_DEFINE_EXPORTED_string( allocator_strategy, kDefaultAllocatorStrategy, "The allocation strategy, enum in [naive_best_fit, auto_growth]. " "naive_best_fit means the original pre-allocated allocator of Paddle. " @@ -358,9 +386,9 @@ DEFINE_string( * size as the memory block will be allocated from the CUDA pinned * request util the CPU does not have enough memory. */ -DEFINE_double(fraction_of_cpu_memory_to_use, 1, - "Default use 100% of CPU memory for PaddlePaddle," - "reserve the rest for page tables, etc"); +PADDLE_DEFINE_EXPORTED_double(fraction_of_cpu_memory_to_use, 1, + "Default use 100% of CPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); /** * Memory related FLAG @@ -374,8 +402,9 @@ DEFINE_double(fraction_of_cpu_memory_to_use, 1, * FLAGS_fraction_of_cpu_memory_to_use*(total physical memory) * as memory block sizes. */ -DEFINE_uint64(initial_cpu_memory_in_mb, 500ul, - "Initial CPU memory for PaddlePaddle, in MD unit."); +PADDLE_DEFINE_EXPORTED_uint64( + initial_cpu_memory_in_mb, 500ul, + "Initial CPU memory for PaddlePaddle, in MD unit."); /** * Memory related FLAG @@ -390,7 +419,7 @@ DEFINE_uint64(initial_cpu_memory_in_mb, 500ul, * size as the memory block will be allocated from the CPU * request util the CPU does not have enough memory. */ -DEFINE_double( +PADDLE_DEFINE_EXPORTED_double( fraction_of_cuda_pinned_memory_to_use, 0.5, "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," "reserve the rest for page tables, etc"); @@ -425,12 +454,13 @@ constexpr static float fraction_of_gpu_memory_to_use = 0.92f; // which may lead to insufficient memory left for paddle constexpr static float fraction_of_gpu_memory_to_use = 0.5f; #endif -DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, - "Allocate a trunk of gpu memory that is this fraction of the " - "total gpu memory size. Future memory usage will be allocated " - "from the trunk. If the trunk doesn't have enough gpu memory, " - "additional trunks of the same size will be requested from gpu " - "until the gpu has no memory left for another trunk."); +PADDLE_DEFINE_EXPORTED_double( + fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, + "Allocate a trunk of gpu memory that is this fraction of the " + "total gpu memory size. Future memory usage will be allocated " + "from the trunk. 
If the trunk doesn't have enough gpu memory, " + "additional trunks of the same size will be requested from gpu " + "until the gpu has no memory left for another trunk."); /** * Memory related FLAG @@ -444,7 +474,7 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, * FLAGS_reallocate_gpu_memory_in_mb will be requested from the GPU until * the GPU has no remaining memory. */ -DEFINE_uint64( +PADDLE_DEFINE_EXPORTED_uint64( initial_gpu_memory_in_mb, 0ul, "Allocate a trunk of gpu memory whose byte size is specified by " "the flag. Future memory usage will be allocated from the " @@ -466,18 +496,20 @@ DEFINE_uint64( * Note: If the allocated GPU memory blocks are exhausted, * additional GPU memory blocks are reallocated */ -DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul, - "If this flag is set, Paddle will reallocate the gpu memory with " - "size specified by this flag. Else Paddle will reallocate by " - "FLAGS_fraction_of_gpu_memory_to_use"); - -DEFINE_uint64(gpu_memory_limit_mb, 0UL, - "The maximum gpu memory limit that the process can allocate. " - "If it is equal to 0, there would be no limit and all gpu memory " - "would be available to the process. If it is larger than 0, " - "the process would raise out of memory error if the allocated " - "memory exceeds the limit even though there is available " - "memory on the gpu card. The unit is MB and default value is 0."); +PADDLE_DEFINE_EXPORTED_uint64( + reallocate_gpu_memory_in_mb, 0ul, + "If this flag is set, Paddle will reallocate the gpu memory with " + "size specified by this flag. Else Paddle will reallocate by " + "FLAGS_fraction_of_gpu_memory_to_use"); + +PADDLE_DEFINE_EXPORTED_uint64( + gpu_memory_limit_mb, 0UL, + "The maximum gpu memory limit that the process can allocate. " + "If it is equal to 0, there would be no limit and all gpu memory " + "would be available to the process. If it is larger than 0, " + "the process would raise out of memory error if the allocated " + "memory exceeds the limit even though there is available " + "memory on the gpu card. The unit is MB and default value is 0."); #endif @@ -489,11 +521,12 @@ DEFINE_uint64(gpu_memory_limit_mb, 0UL, * Example: * Note: */ -DEFINE_double(local_exe_sub_scope_limit, 256.0, // MBytes - "The memory up limit of sub-scopes of local execution scope for " - "each CUDAPlace. If you don't need to limit the memory, " - "you should set FLAGS_local_exe_sub_scope_limit=-1. " - "The default value is 256 MBytes."); +PADDLE_DEFINE_EXPORTED_double( + local_exe_sub_scope_limit, 256.0, // MBytes + "The memory up limit of sub-scopes of local execution scope for " + "each CUDAPlace. If you don't need to limit the memory, " + "you should set FLAGS_local_exe_sub_scope_limit=-1. " + "The default value is 256 MBytes."); /** * MKLDNN related FLAG @@ -503,7 +536,7 @@ DEFINE_double(local_exe_sub_scope_limit, 256.0, // MBytes * Example: * Note: */ -DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); +PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run"); /** * Debug related FLAG @@ -525,7 +558,7 @@ static const int32_t kDefaultCallStackLevel = 2; static const int32_t kDefaultCallStackLevel = 1; #endif -DEFINE_int32( +PADDLE_DEFINE_EXPORTED_int32( call_stack_level, kDefaultCallStackLevel, "Determine the call stack to print when error or exeception happens." // TODO(zhiqiu): implement logic of FLAGS_call_stack_level==0 @@ -545,9 +578,9 @@ DEFINE_int32( * Note: If True, gradients are summed by the reverse order of * the forward execution sequence. 
*/ -DEFINE_bool(sort_sum_gradient, false, - "Sum gradients by the reverse order of " - "the forward execution sequence."); +PADDLE_DEFINE_EXPORTED_bool(sort_sum_gradient, false, + "Sum gradients by the reverse order of " + "the forward execution sequence."); /** * Performance related FLAG @@ -557,7 +590,7 @@ DEFINE_bool(sort_sum_gradient, false, * Example: * Note: The maximum number of inplace grad_add. */ -DEFINE_int32( +PADDLE_DEFINE_EXPORTED_int32( max_inplace_grad_add, 0, "The maximum number of inplace grad_add. When doing " "gradient accumulation, if the number of gradients need to that " @@ -572,8 +605,8 @@ DEFINE_int32( * Example: * Note: Holds list of operation types with OneDNN kernels to be enabled. */ -DEFINE_string(tracer_mkldnn_ops_on, "", - "List of OneDNN operation types to be turned on"); +PADDLE_DEFINE_EXPORTED_string(tracer_mkldnn_ops_on, "", + "List of OneDNN operation types to be turned on"); /** * Debug related FLAG @@ -583,8 +616,9 @@ DEFINE_string(tracer_mkldnn_ops_on, "", * Example: * Note: Holds list of operation types with OneDNN kernels to be disabled. */ -DEFINE_string(tracer_mkldnn_ops_off, "", - "List of OneDNN operation types to be turned off"); +PADDLE_DEFINE_EXPORTED_string( + tracer_mkldnn_ops_off, "", + "List of OneDNN operation types to be turned off"); /** * Debug related FLAG @@ -595,8 +629,9 @@ DEFINE_string(tracer_mkldnn_ops_off, "", * Note: Check kernel launch status after every kernel compute. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -DEFINE_bool(check_kernel_launch, false, - "Check kernel launch status after every kernel compute"); +PADDLE_DEFINE_EXPORTED_bool( + check_kernel_launch, false, + "Check kernel launch status after every kernel compute"); #endif /** @@ -608,7 +643,8 @@ DEFINE_bool(check_kernel_launch, false, * Note: Disable cudnn in conv2d. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d"); +PADDLE_DEFINE_EXPORTED_bool(conv2d_disable_cudnn, false, + "Disable cudnn in conv2d"); #endif /** @@ -621,8 +657,8 @@ DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d"); */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || \ defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_HIP) -DEFINE_int32(get_host_by_name_time, 120, - "The maximum time for get host by name time"); +PADDLE_DEFINE_EXPORTED_int32(get_host_by_name_time, 120, + "The maximum time for get host by name time"); #endif /** @@ -634,6 +670,6 @@ DEFINE_int32(get_host_by_name_time, 120, * program when using Fleet APIs. * Note: Apply IR pass to program. Be only useful when using Fleet APIs. */ -DEFINE_bool( +PADDLE_DEFINE_EXPORTED_bool( apply_pass_to_program, false, "It controls whether to apply IR pass to program when using Fleet APIs"); diff --git a/paddle/fluid/platform/flags.h b/paddle/fluid/platform/flags.h new file mode 100644 index 0000000000000..b9d78c2e9dc39 --- /dev/null +++ b/paddle/fluid/platform/flags.h @@ -0,0 +1,98 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "gflags/gflags.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace platform { + +struct FlagInfo { + using ValueType = + boost::variant; + std::string name; + mutable void *value_ptr; + ValueType default_value; + std::string doc; + bool is_writable; +}; + +using ExportedFlagInfoMap = std::map; +const ExportedFlagInfoMap &GetExportedFlagInfoMap(); +ExportedFlagInfoMap *GetMutableExportedFlagInfoMap(); + +#define __PADDLE_DEFINE_EXPORTED_FLAG(__name, __is_writable, __cpp_type, \ + __gflag_type, __default_value, __doc) \ + DEFINE_##__gflag_type(__name, __default_value, __doc); \ + struct __PaddleRegisterFlag_##__name { \ + __PaddleRegisterFlag_##__name() { \ + using FlagDeclaredType = \ + typename std::remove_reference::type; \ + static_assert(std::is_same::value || \ + std::is_arithmetic::value, \ + "FLAGS should be std::string or arithmetic type"); \ + auto *instance = ::paddle::platform::GetMutableExportedFlagInfoMap(); \ + auto &info = (*instance)[#__name]; \ + info.name = #__name; \ + info.value_ptr = &(FLAGS_##__name); \ + info.default_value = static_cast<__cpp_type>(__default_value); \ + info.doc = __doc; \ + info.is_writable = __is_writable; \ + } \ + int Touch() const { return 0; } \ + }; \ + static __PaddleRegisterFlag_##__name __PaddleRegisterFlag_instance##__name; \ + int TouchPaddleFlagRegister_##__name() { \ + return __PaddleRegisterFlag_instance##__name.Touch(); \ + } \ + static_assert(std::is_same<__PaddleRegisterFlag_##__name, \ + ::__PaddleRegisterFlag_##__name>::value, \ + "FLAGS should define in global namespace") + +#define PADDLE_FORCE_LINK_FLAG(__name) \ + extern int TouchPaddleFlagRegister_##__name(); \ + UNUSED static int __paddle_use_flag_##__name = \ + TouchPaddleFlagRegister_##__name() + +#define PADDLE_DEFINE_EXPORTED_bool(name, default_value, doc) \ + __PADDLE_DEFINE_EXPORTED_FLAG(name, true, bool, bool, default_value, doc) +#define PADDLE_DEFINE_EXPORTED_READONLY_bool(name, default_value, doc) \ + __PADDLE_DEFINE_EXPORTED_FLAG(name, false, bool, bool, default_value, doc) + +#define PADDLE_DEFINE_EXPORTED_int32(name, default_value, doc) \ + __PADDLE_DEFINE_EXPORTED_FLAG(name, true, int32_t, int32, default_value, doc) + +#define PADDLE_DEFINE_EXPORTED_int64(name, default_value, doc) \ + __PADDLE_DEFINE_EXPORTED_FLAG(name, true, int64_t, int64, default_value, doc) + +#define PADDLE_DEFINE_EXPORTED_uint64(name, default_value, doc) \ + __PADDLE_DEFINE_EXPORTED_FLAG(name, true, uint64_t, uint64, default_value, \ + doc) + +#define PADDLE_DEFINE_EXPORTED_double(name, default_value, doc) \ + __PADDLE_DEFINE_EXPORTED_FLAG(name, true, double, double, default_value, doc) + +#define PADDLE_DEFINE_EXPORTED_string(name, default_value, doc) \ + __PADDLE_DEFINE_EXPORTED_FLAG(name, true, ::std::string, string, \ + default_value, doc) + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index afae046531143..290b3353ae54c 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -43,9 +43,10 @@ limitations under the License. */ #endif DECLARE_int32(paddle_num_threads); -DEFINE_int32(multiple_of_cupti_buffer_size, 1, - "Multiple of the CUPTI device buffer size. 
If the timestamps have " - "been dropped when you are profiling, try increasing this value."); +PADDLE_DEFINE_EXPORTED_int32( + multiple_of_cupti_buffer_size, 1, + "Multiple of the CUPTI device buffer size. If the timestamps have " + "been dropped when you are profiling, try increasing this value."); namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 49160f9463240..1aa8c0cdb57f9 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include + #include "boost/optional.hpp" #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/operator.h" @@ -603,7 +604,6 @@ class MKLDNNHandler { const std::string& base_key) : dev_ctx_(dev_ctx), engine_(engine), - key_common_(base_key), key_(platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, base_key)) { platform::MKLDNNDeviceContext::tls().log_lib_version(); } @@ -789,7 +789,6 @@ class MKLDNNHandler { protected: const MKLDNNDeviceContext& dev_ctx_; mkldnn::engine engine_; - std::string key_common_; std::string key_; }; @@ -929,7 +928,6 @@ class BroadcastDataMKLDNNHandler std::shared_ptr AcquireDstMemory(framework::Tensor* output) { T_out* ptr = output->mutable_data( this->place_, this->fwd_pd_->dst_desc().get_size()); - ; memset(ptr, 0, this->fwd_pd_->dst_desc().get_size()); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); } @@ -942,7 +940,8 @@ class ReductionMKLDNNHandler ReductionMKLDNNHandler(const dnnl::algorithm algo, const float p, const float eps, const mkldnn::engine engine, platform::Place cpu_place, const Tensor* x, - const Tensor* y, std::vector y_tz) + const Tensor* y, std::vector y_tz, + const dnnl::primitive_attr& attr = NULL) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { PADDLE_ENFORCE_EQ( @@ -959,7 +958,10 @@ class ReductionMKLDNNHandler const auto y_md = memory::desc(y_tz, platform::MKLDNNGetDataType(), x->format()); - this->AcquireForwardPrimitiveDescriptor(algo, x_md, y_md, p, eps); + if (attr) + this->AcquireForwardPrimitiveDescriptor(attr, algo, x_md, y_md, p, eps); + else + this->AcquireForwardPrimitiveDescriptor(algo, x_md, y_md, p, eps); } }; @@ -981,8 +983,9 @@ class ActivationMKLDNNHandler if (ctx.Type() == "scale") { bool bias_after_scale = ctx.Attr("bias_after_scale"); auto* scale_tensor = ctx.Input("ScaleTensor"); - alpha = (scale_tensor == nullptr) ? ctx.Attr("scale") - : (float)*(scale_tensor->data()); + alpha = (scale_tensor == nullptr) + ? 
ctx.Attr("scale") + : static_cast(*(scale_tensor->data())); beta = ctx.Attr("bias"); // if bias_after_scale == true // out = scale*X + bias @@ -1071,138 +1074,73 @@ class ActivationMKLDNNHandler } }; -class ReorderMKLDNNHandler : public MKLDNNHandler { +class ReorderMKLDNNHandler { public: ReorderMKLDNNHandler(std::vector& dims, // NOLINT framework::proto::VarType::Type vtype, - mkldnn::memory::data_type dtype, - const platform::MKLDNNDeviceContext& dev_ctx, - mkldnn::engine engine, const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key), - dims_(dims), + mkldnn::memory::data_type dtype, mkldnn::engine engine) + : dims_(dims), vtype_(vtype), vtype_dst_(vtype), dtype_(dtype), - dtype_dst_(dtype) {} + dtype_dst_(dtype), + engine_(engine) {} ReorderMKLDNNHandler(std::vector& dims, // NOLINT framework::proto::VarType::Type vtype, mkldnn::memory::data_type dtype, framework::proto::VarType::Type vtype_dst, mkldnn::memory::data_type dtype_dst, - const platform::MKLDNNDeviceContext& dev_ctx, - mkldnn::engine engine, const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key), - dims_(dims), + mkldnn::engine engine) + : dims_(dims), vtype_(vtype), vtype_dst_(vtype_dst), dtype_(dtype), - dtype_dst_(dtype_dst) {} + dtype_dst_(dtype_dst), + engine_(engine) {} std::shared_ptr AcquireSrcMemory( const MKLDNNMemoryFormat& fmt, void* ptr) { - return this->AcquireMemory(dims_, dtype_, fmt, ptr, "@user_src_mem_p"); + auto md = mkldnn::memory::desc(dims_, dtype_, fmt); + return std::make_shared(md, engine_, ptr); } std::shared_ptr AcquireSubmemory( const std::vector& dims, const std::vector& offset, - const std::shared_ptr& mem_p, int submemory_number = 0) { - std::string local_key = key_; - local_key.append("@submem") - .append(std::to_string(submemory_number)) - .append("_p"); - - auto sub_mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (sub_mem_p == nullptr) { - auto sub_md = mem_p->get_desc().submemory_desc(dims, {offset}); - sub_mem_p = std::make_shared(sub_md, engine_, - mem_p->get_data_handle()); - dev_ctx_.SetBlob(local_key, sub_mem_p); - } else { - sub_mem_p->set_data_handle(mem_p->get_data_handle()); - } + const std::shared_ptr& mem_p) { + auto sub_md = mem_p->get_desc().submemory_desc(dims, {offset}); + auto sub_mem_p = std::make_shared(sub_md, engine_, + mem_p->get_data_handle()); return sub_mem_p; } std::shared_ptr AcquireDstMemory( framework::Tensor* output, const MKLDNNMemoryFormat& fmt, platform::Place place) { - auto local_key = key_ + "@user_dst_mem_p"; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_dst_, fmt); - auto dst_data = - output->mutable_data(place, vtype_dst_, dst_md.get_size()); - - mem_p = std::make_shared(dst_md, engine_, dst_data); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - // Even if memory object exists , we may be using it for diffrent tensor - auto dst_data = - output->mutable_data(place, vtype_dst_, mem_p->get_desc().get_size()); - mem_p->set_data_handle(dst_data); - } - return mem_p; + auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_dst_, fmt); + auto dst_data = output->mutable_data(place, vtype_dst_, dst_md.get_size()); + return std::make_shared(dst_md, engine_, dst_data); } std::shared_ptr AcquireDstMemory( framework::Tensor* output, const std::vector& dims, - const int memory_number, const MKLDNNMemoryFormat& fmt, - platform::Place place) { - auto local_key = - key_ + "@user_dst_mem" 
+ std::to_string(memory_number) + "_p"; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - auto dst_md = platform::MKLDNNMemDesc(dims, dtype_dst_, fmt); - auto dst_data = - output->mutable_data(place, vtype_dst_, dst_md.get_size()); - - mem_p = std::make_shared(dst_md, engine_, dst_data); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - // Even if memory object exists , we may be using it for diffrent tensor - auto dst_data = - output->mutable_data(place, vtype_dst_, mem_p->get_desc().get_size()); - mem_p->set_data_handle(dst_data); - } - return mem_p; - } - - std::shared_ptr AcquireReorder( - std::shared_ptr dst_memory_p, - std::shared_ptr src_memory_p, int reorder_number) { - auto prim_key = key_ + "@reorder" + std::to_string(reorder_number) + "_p"; - auto reorder_p = - std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); - if (reorder_p == nullptr) { - reorder_p = - std::make_shared(*(src_memory_p), *(dst_memory_p)); - dev_ctx_.SetBlob(prim_key, reorder_p); - } - return reorder_p; + const MKLDNNMemoryFormat& fmt, platform::Place place) { + auto dst_md = platform::MKLDNNMemDesc(dims, dtype_dst_, fmt); + auto dst_data = output->mutable_data(place, vtype_dst_, dst_md.get_size()); + return std::make_shared(dst_md, engine_, dst_data); } std::shared_ptr AcquireReorder( std::shared_ptr dst_memory_p, std::shared_ptr src_memory_p) { - auto prim_key = key_ + "@reorder_p"; - auto reorder_p = - std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); - if (reorder_p == nullptr) { - reorder_p = - std::make_shared(*(src_memory_p), *(dst_memory_p)); - dev_ctx_.SetBlob(prim_key, reorder_p); - } - return reorder_p; + return std::make_shared(*(src_memory_p), *(dst_memory_p)); } private: std::vector dims_; framework::proto::VarType::Type vtype_, vtype_dst_; mkldnn::memory::data_type dtype_, dtype_dst_; + mkldnn::engine engine_; }; template @@ -1436,42 +1374,34 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { // Conv PD has to be passed to Grad op that // may be exxecuted by diffrent thread, hence // for that one we use key that does not contain TID - const std::string key_conv_pd = key_common_ + "@conv_pd"; + const std::string key_conv_pd = key_ + "@conv_pd"; conv_pd_ = std::static_pointer_cast( dev_ctx_.GetBlob(key_conv_pd)); if (conv_pd_ == nullptr) { - static std::mutex acquire_barrier; - std::lock_guard block_threads_until_finish_this_job( - acquire_barrier); - - conv_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_conv_pd)); - if (conv_pd_ == nullptr) { - mkldnn::memory::dims stride_dims = strides; - mkldnn::memory::dims dilations_dims = dilations; - auto mkldnn_paddings = ToMkldnnPadding(paddings); - - auto conv_desc = - bias ? 
typename forward_t::desc( - fwd_prop_kind, convolutional_algorithm::T, - src, weights, *bias, dst, stride_dims, dilations_dims, - mkldnn_paddings[0], mkldnn_paddings[1]) - : typename forward_t::desc( - fwd_prop_kind, convolutional_algorithm::T, - src, weights, dst, stride_dims, dilations_dims, - mkldnn_paddings[0], mkldnn_paddings[1]); - - mkldnn::primitive_attr conv_attr = - CreatePostOps(fuse_activation, fuse_alpha, fuse_beta, - fuse_residual_conn, output_shift_scale, sum_scale); - - conv_pd_.reset(new typename forward_t::primitive_desc( - conv_desc, conv_attr, engine)); - // Save conv_pd/src_memory/weights_memory for backward pass - dev_ctx_.SetBlob(key_conv_pd, conv_pd_); - } + mkldnn::memory::dims stride_dims = strides; + mkldnn::memory::dims dilations_dims = dilations; + auto mkldnn_paddings = ToMkldnnPadding(paddings); + + auto conv_desc = + bias ? typename forward_t::desc( + fwd_prop_kind, convolutional_algorithm::T, src, + weights, *bias, dst, stride_dims, dilations_dims, + mkldnn_paddings[0], mkldnn_paddings[1]) + : typename forward_t::desc( + fwd_prop_kind, convolutional_algorithm::T, src, + weights, dst, stride_dims, dilations_dims, + mkldnn_paddings[0], mkldnn_paddings[1]); + + mkldnn::primitive_attr conv_attr = + CreatePostOps(fuse_activation, fuse_alpha, fuse_beta, + fuse_residual_conn, output_shift_scale, sum_scale); + + conv_pd_.reset( + new typename forward_t::primitive_desc(conv_desc, conv_attr, engine)); + // Save conv_pd/src_memory/weights_memory for backward pass + dev_ctx_.SetBlob(key_conv_pd, conv_pd_); } return conv_pd_; @@ -1579,6 +1509,7 @@ static void SetDstMemoryQuantized( T* output_data = output->mutable_data(ctx.GetPlace()); const size_t dst_dims = dst_tz.size(); MKLDNNMemoryFormat dst_fmt; + PADDLE_ENFORCE_LE(dst_dims, 5, platform::errors::InvalidArgument( "Dst memory for quantization can not have " "dims > 5. But received dst_dims is %d.", diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index 14c772d88897f..415babc9cb85e 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -14,11 +14,12 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" -DEFINE_bool(benchmark, false, - "Doing memory benchmark. It will make deleting scope synchronized, " - "and add some memory usage logs." - "Default cuda is asynchronous device, set to True will" - "force op run in synchronous mode."); +PADDLE_DEFINE_EXPORTED_bool( + benchmark, false, + "Doing memory benchmark. It will make deleting scope synchronized, " + "and add some memory usage logs." + "Default cuda is asynchronous device, set to True will" + "force op run in synchronous mode."); namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 9c33233e1f79a..40d9bb99f44f5 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -24,7 +24,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/dynload/nvtx.h" #endif -DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); +PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false, + "Enable rpc profiler or not."); namespace paddle { namespace platform { @@ -262,9 +263,40 @@ void DisableProfiler(EventSortingKey sorted_key, ParseEvents(all_events, true, sorted_key); ParseEvents(all_events, false, sorted_key); - if (VLOG_IS_ON(5)) { - std::vector> all_mem_events = GetMemEvents(); - ParseMemEvents(all_mem_events); + + std::vector> all_mem_events = GetMemEvents(); + ParseMemEvents(all_mem_events); + + ResetProfiler(); + g_state = ProfilerState::kDisabled; + g_tracer_option = TracerOption::kDefault; + should_send_profile_state = true; +} + +void CompleteProfilerEvents(proto::Profile *tracer_profile, + std::vector> *time_events, + std::vector> *mem_events) { + SynchronizeAllDevice(); + MemEvenRecorder::Instance().Flush(); + + std::lock_guard l(profiler_mu); + if (g_state == ProfilerState::kDisabled) return; + + // Mark the profiling stop. + Mark("_stop_profiler_"); + + DeviceTracer *tracer = GetDeviceTracer(); + if (tracer->IsEnabled() && tracer_profile != nullptr) { + tracer->Disable(); + tracer->GenEventKernelCudaElapsedTime(); + *tracer_profile = tracer->GetProfile(); + } + + if (time_events != nullptr) { + *time_events = GetAllEvents(); + } + if (mem_events != nullptr) { + *mem_events = GetMemEvents(); } ResetProfiler(); diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 512bbc195b5b2..fbae6165e313a 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -28,9 +28,12 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.pb.h" + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/gpu_info.h" #endif + namespace paddle { namespace platform { @@ -215,6 +218,11 @@ void EnableProfiler(ProfilerState state); void ResetProfiler(); void DisableProfiler(EventSortingKey sorted_key, const std::string& profile_path); +// Disable profiler but return events instead of print it. +void CompleteProfilerEvents(proto::Profile* tracer_profile, + std::vector>* time_events, + std::vector>* mem_events); + // Test if the profiler is currently enabled. bool IsProfileEnabled(); // Whether the trainer should send profiling state to PS. 
diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index ae4d75113cd06..a8438263cb97b 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -820,7 +820,6 @@ void ParseEvents(const std::vector> &events, std::multimap child_map; size_t max_name_width = 0; OverHead overhead; - AnalyzeEvent(analyze_events, &events_table, &child_map, sorted_func, sorted_by, &max_name_width, &overhead, merge_thread); diff --git a/paddle/fluid/platform/xpu/xpu1_op_list.h b/paddle/fluid/platform/xpu/xpu1_op_list.h index cdd60a856fbc9..c9545d675f90e 100644 --- a/paddle/fluid/platform/xpu/xpu1_op_list.h +++ b/paddle/fluid/platform/xpu/xpu1_op_list.h @@ -318,7 +318,10 @@ XPUOpMap& get_kl1_ops() { pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"momuntem", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})} + {"momuntem", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"iou_similarity", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})} // AddMore }; diff --git a/paddle/fluid/platform/xpu/xpu2_op_list.h b/paddle/fluid/platform/xpu/xpu2_op_list.h index 5b9e1a34bfcd5..651243a4dfe66 100644 --- a/paddle/fluid/platform/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/xpu/xpu2_op_list.h @@ -107,7 +107,9 @@ XPUOpMap& get_kl2_ops() { {"transpose2_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - + {"iou_similarity", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})} // AddMore }; diff --git a/paddle/fluid/platform/xpu/xpu_info.cc b/paddle/fluid/platform/xpu/xpu_info.cc index 6b8ab16b47d68..3f45286d8f202 100644 --- a/paddle/fluid/platform/xpu/xpu_info.cc +++ b/paddle/fluid/platform/xpu/xpu_info.cc @@ -18,14 +18,15 @@ limitations under the License. */ #include "paddle/fluid/platform/xpu/xpu_header.h" #include "paddle/fluid/string/split.h" -DEFINE_string(selected_xpus, "", - "A list of device ids separated by comma, like: 0,1,2,3. " - "This option is useful when doing multi process training and " - "each process have only one device (XPU). If you want to use " - "all visible devices, set this to empty string. NOTE: the " - "reason of doing this is that we want to use P2P communication" - "between XPU devices, use XPU_VISIBLE_DEVICES can only use" - "share-memory only."); +PADDLE_DEFINE_EXPORTED_string( + selected_xpus, "", + "A list of device ids separated by comma, like: 0,1,2,3. " + "This option is useful when doing multi process training and " + "each process have only one device (XPU). If you want to use " + "all visible devices, set this to empty string. 
NOTE: the " + "reason of doing this is that we want to use P2P communication" + "between XPU devices, use XPU_VISIBLE_DEVICES can only use" + "share-memory only."); namespace paddle { namespace platform { diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 4ca4675883835..22778013f2390 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -6,7 +6,8 @@ include_directories(${PADDLE_SOURCE_DIR}/paddle/utils) set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context - gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator) + gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator + cost_model) if (WITH_PSCORE) set(PYBIND_DEPS ${PYBIND_DEPS} ps_service) @@ -17,6 +18,10 @@ if (WITH_GPU OR WITH_ROCM) set(PYBIND_DEPS ${PYBIND_DEPS} cuda_device_guard) endif() +if (WITH_GPU) + set(PYBIND_DEPS ${PYBIND_DEPS} cuda_profiler) +endif() + if (WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) @@ -55,6 +60,7 @@ set(PYBIND_SRCS data_set_py.cc imperative.cc ir.cc + bind_cost_model.cc inference_api.cc compatible.cc io.cc diff --git a/paddle/fluid/pybind/bind_cost_model.cc b/paddle/fluid/pybind/bind_cost_model.cc new file mode 100644 index 0000000000000..a4a40f1fd02c9 --- /dev/null +++ b/paddle/fluid/pybind/bind_cost_model.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/pybind/bind_cost_model.h" + +#include +#include "paddle/fluid/framework/ir/cost_model.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace py = pybind11; +using paddle::framework::CostData; +using paddle::framework::CostModel; +using paddle::framework::ProgramDesc; + +namespace paddle { +namespace pybind { + +void BindCostModel(py::module* m) { + py::class_(*m, "CostData") + .def(py::init<>()) + .def("get_whole_time_ms", &CostData::GetWholeTimeMs) + .def("get_op_time_ms", &CostData::GetOpTimeMs); + + py::class_(*m, "CostModel") + .def(py::init<>()) + .def("profile_measure", + [](CostModel& self, py::object py_main_program, + py::object py_startup_program, const std::string& device, + const std::vector& fetch_cost_list) { + py::object py_main_program_desc = py_main_program.attr("desc"); + ProgramDesc* main_program_desc = + py_main_program_desc.cast(); + + py::object py_startup_program_desc = + py_startup_program.attr("desc"); + ProgramDesc* startup_program_desc = + py_startup_program_desc.cast(); + return self.ProfileMeasure(*main_program_desc, + *startup_program_desc, device, + fetch_cost_list); + }); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/bind_cost_model.h b/paddle/fluid/pybind/bind_cost_model.h new file mode 100644 index 0000000000000..2545ab675026c --- /dev/null +++ b/paddle/fluid/pybind/bind_cost_model.h @@ -0,0 +1,25 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +namespace paddle { +namespace pybind { + +void BindCostModel(pybind11::module *m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/fleet_wrapper_py.cc b/paddle/fluid/pybind/fleet_wrapper_py.cc index 873476629cb78..d8142f717baed 100644 --- a/paddle/fluid/pybind/fleet_wrapper_py.cc +++ b/paddle/fluid/pybind/fleet_wrapper_py.cc @@ -76,6 +76,8 @@ void BindFleetWrapper(py::module* m) { .def("shrink_sparse_table", &framework::FleetWrapper::ShrinkSparseTable) .def("shrink_dense_table", &framework::FleetWrapper::ShrinkDenseTable) .def("print_table_stat", &framework::FleetWrapper::PrintTableStat) + .def("set_file_num_one_shard", + &framework::FleetWrapper::SetFileNumOneShard) .def("client_flush", &framework::FleetWrapper::ClientFlush) .def("load_from_paddle_model", &framework::FleetWrapper::LoadFromPaddleModel) diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc index 59c7628447479..e7f1bef4bee62 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -29,82 +29,17 @@ #include "paddle/fluid/platform/macros.h" #include "pybind11/stl.h" -// data processing -DECLARE_bool(use_mkldnn); -DECLARE_string(tracer_mkldnn_ops_on); -DECLARE_string(tracer_mkldnn_ops_off); -// debug -DECLARE_bool(check_nan_inf); -DECLARE_bool(cpu_deterministic); -DECLARE_bool(enable_rpc_profiler); -DECLARE_int32(multiple_of_cupti_buffer_size); -DECLARE_bool(reader_queue_speed_test_mode); -DECLARE_int32(call_stack_level); -DECLARE_bool(sort_sum_gradient); -DECLARE_bool(check_kernel_launch); -// device management -DECLARE_int32(paddle_num_threads); -// executor -DECLARE_bool(enable_parallel_graph); -DECLARE_string(pe_profile_fname); -DECLARE_string(print_sub_graph_dir); -DECLARE_bool(use_ngraph); -// memory management -DECLARE_string(allocator_strategy); -DECLARE_double(eager_delete_tensor_gb); -DECLARE_double(fraction_of_cpu_memory_to_use); -DECLARE_bool(free_idle_chunk); -DECLARE_bool(free_when_no_cache_hit); -DECLARE_int32(fuse_parameter_groups_size); -DECLARE_double(fuse_parameter_memory_size); -DECLARE_bool(init_allocated_mem); -DECLARE_uint64(initial_cpu_memory_in_mb); -DECLARE_double(memory_fraction_of_eager_deletion); -DECLARE_bool(use_pinned_memory); -DECLARE_bool(use_system_allocator); -// others -DECLARE_bool(benchmark); -DECLARE_int32(inner_op_parallelism); -DECLARE_int32(max_inplace_grad_add); -DECLARE_string(tracer_profile_fname); -DECLARE_bool(apply_pass_to_program); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -// cudnn -DECLARE_uint64(conv_workspace_size_limit); -DECLARE_bool(cudnn_batchnorm_spatial_persistent); -DECLARE_bool(cudnn_deterministic); -DECLARE_bool(cudnn_exhaustive_search); -DECLARE_bool(conv2d_disable_cudnn); -// data processing -DECLARE_bool(enable_cublas_tensor_op_math); -// device management -DECLARE_string(selected_gpus); -// memory management -DECLARE_bool(eager_delete_scope); -DECLARE_bool(fast_eager_deletion_mode); -DECLARE_double(fraction_of_cuda_pinned_memory_to_use); -DECLARE_double(fraction_of_gpu_memory_to_use); -DECLARE_uint64(gpu_memory_limit_mb); -DECLARE_uint64(initial_gpu_memory_in_mb); -DECLARE_uint64(reallocate_gpu_memory_in_mb); -// others -DECLARE_bool(sync_nccl_allreduce); -#endif - -#ifdef PADDLE_WITH_XPU -// device management -DECLARE_string(selected_xpus); -#endif - -#ifdef PADDLE_WITH_ASCEND_CL -// device management -DECLARE_string(selected_npus); -// set minmum loss 
scaling value -DECLARE_int32(min_loss_scaling); -#endif - +// FIXME(zengjinle): these 2 flags may be removed by the linker when compiling +// CPU-only Paddle. It is because they are only used in +// AutoGrowthBestFitAllocator, but AutoGrowthBestFitAllocator is not used +// (in the translation unit level) when compiling CPU-only Paddle. I do not +// want to add PADDLE_FORCE_LINK_FLAG, but I have not found any other methods +// to solve this problem. +PADDLE_FORCE_LINK_FLAG(free_idle_chunk); +PADDLE_FORCE_LINK_FLAG(free_when_no_cache_hit); + +// NOTE: where are these 2 flags from? #ifdef PADDLE_WITH_DISTRIBUTE -DECLARE_int32(rpc_send_thread_num); DECLARE_int32(rpc_get_thread_num); DECLARE_int32(rpc_prefetch_thread_num); #endif @@ -181,7 +116,6 @@ class PYBIND11_HIDDEN GlobalVarGetterSetterRegistry { PADDLE_ENFORCE_NOT_NULL(setter, platform::errors::InvalidArgument( "Setter of %s should not be null", name)); - var_infos_.insert({name, VarInfo(is_public, getter, setter)}); } @@ -243,81 +177,6 @@ class PYBIND11_HIDDEN GlobalVarGetterSetterRegistry { GlobalVarGetterSetterRegistry GlobalVarGetterSetterRegistry::instance_; -class GlobalVarGetterSetterRegistryHelper { - public: - GlobalVarGetterSetterRegistryHelper(bool is_public, bool is_writable, - const std::string &var_names) - : is_public_(is_public), - is_writable_(is_writable), - var_names_(SplitVarNames(var_names)) {} - - template - void Register(Args &&... args) const { - Impl<0, sizeof...(args) == 1, Args...>::Register( - is_public_, is_writable_, var_names_, std::forward(args)...); - } - - private: - static std::vector SplitVarNames(const std::string &names) { - auto valid_char = [](char ch) { return !std::isspace(ch) && ch != ','; }; - - std::vector ret; - size_t i = 0, j = 0, n = names.size(); - while (i < n) { - for (; i < n && !valid_char(names[i]); ++i) { - } - for (j = i + 1; j < n && valid_char(names[j]); ++j) { - } - - if (i < n && j <= n) { - auto substring = names.substr(i, j - i); - VLOG(10) << "Get substring: \"" << substring << "\""; - ret.emplace_back(substring); - } - i = j + 1; - } - return ret; - } - - private: - template - struct Impl { - static void Register(bool is_public, bool is_writable, - const std::vector &var_names, T &&var, - Args &&... args) { - PADDLE_ENFORCE_EQ(kIdx + 1 + sizeof...(args), var_names.size(), - platform::errors::InvalidArgument( - "Argument number not match name number")); - Impl::Register(is_public, is_writable, var_names, var); - Impl::Register( - is_public, is_writable, var_names, std::forward(args)...); - } - }; - - template - struct Impl { - static void Register(bool is_public, bool is_writable, - const std::vector &var_names, T &&var) { - auto *instance = GlobalVarGetterSetterRegistry::MutableInstance(); - if (is_writable) { - instance->Register( - var_names[kIdx], is_public, - GlobalVarGetterSetterRegistry::CreateGetter(std::forward(var)), - GlobalVarGetterSetterRegistry::CreateSetter(&var)); - } else { - instance->Register( - var_names[kIdx], is_public, - GlobalVarGetterSetterRegistry::CreateGetter(std::forward(var))); - } - } - }; - - private: - const bool is_public_; - const bool is_writable_; - const std::vector var_names_; -}; - static void RegisterGlobalVarGetterSetter(); void BindGlobalValueGetterSetter(pybind11::module *module) { @@ -338,65 +197,57 @@ void BindGlobalValueGetterSetter(pybind11::module *module) { } /* Public vars are designed to be writable. */ -#define REGISTER_PUBLIC_GLOBAL_VAR(...) 
\ - do { \ - GlobalVarGetterSetterRegistryHelper(/*is_public=*/true, \ - /*is_writable=*/true, "" #__VA_ARGS__) \ - .Register(__VA_ARGS__); \ +#define REGISTER_PUBLIC_GLOBAL_VAR(var) \ + do { \ + auto *instance = GlobalVarGetterSetterRegistry::MutableInstance(); \ + instance->Register(#var, /*is_public=*/true, \ + GlobalVarGetterSetterRegistry::CreateGetter(var), \ + GlobalVarGetterSetterRegistry::CreateSetter(&var)); \ } while (0) -#define REGISTER_PRIVATE_GLOBAL_VAR(is_writable, ...) \ - do { \ - GlobalVarGetterSetterRegistryHelper(/*is_public=*/false, is_writable, \ - "" #__VA_ARGS__) \ - .Register(__VA_ARGS__); \ - } while (0) +struct RegisterGetterSetterVisitor : public boost::static_visitor { + RegisterGetterSetterVisitor(const std::string &name, bool is_writable, + void *value_ptr) + : name_(name), is_writable_(is_writable), value_ptr_(value_ptr) {} -static void RegisterGlobalVarGetterSetter() { - REGISTER_PRIVATE_GLOBAL_VAR(/*is_writable=*/false, FLAGS_free_idle_chunk, - FLAGS_free_when_no_cache_hit); - - REGISTER_PUBLIC_GLOBAL_VAR( - FLAGS_eager_delete_tensor_gb, FLAGS_enable_parallel_graph, - FLAGS_allocator_strategy, FLAGS_use_system_allocator, FLAGS_check_nan_inf, - FLAGS_call_stack_level, FLAGS_sort_sum_gradient, FLAGS_cpu_deterministic, - FLAGS_enable_rpc_profiler, FLAGS_multiple_of_cupti_buffer_size, - FLAGS_reader_queue_speed_test_mode, FLAGS_pe_profile_fname, - FLAGS_print_sub_graph_dir, FLAGS_fraction_of_cpu_memory_to_use, - FLAGS_fuse_parameter_groups_size, FLAGS_fuse_parameter_memory_size, - FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb, - FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory, - FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname, - FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add, - FLAGS_tracer_mkldnn_ops_on, FLAGS_tracer_mkldnn_ops_off, - FLAGS_apply_pass_to_program); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - REGISTER_PUBLIC_GLOBAL_VAR( - FLAGS_gpu_memory_limit_mb, FLAGS_cudnn_deterministic, - FLAGS_conv_workspace_size_limit, FLAGS_cudnn_batchnorm_spatial_persistent, - FLAGS_cudnn_exhaustive_search, FLAGS_eager_delete_scope, - FLAGS_fast_eager_deletion_mode, - FLAGS_fraction_of_cuda_pinned_memory_to_use, - FLAGS_fraction_of_gpu_memory_to_use, FLAGS_initial_gpu_memory_in_mb, - FLAGS_reallocate_gpu_memory_in_mb, FLAGS_enable_cublas_tensor_op_math, - FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce, - FLAGS_conv2d_disable_cudnn, FLAGS_check_kernel_launch); -#endif -#ifdef PADDLE_WITH_XPU - REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_xpus); -#endif + template + void operator()(const T &) const { + auto &value = *static_cast(value_ptr_); + auto *instance = GlobalVarGetterSetterRegistry::MutableInstance(); + bool is_public = is_writable_; // currently, all writable vars are public + if (is_writable_) { + instance->Register(name_, is_public, + GlobalVarGetterSetterRegistry::CreateGetter(value), + GlobalVarGetterSetterRegistry::CreateSetter(&value)); + } else { + instance->Register(name_, is_public, + GlobalVarGetterSetterRegistry::CreateGetter(value)); + } + } -#ifdef PADDLE_WITH_ASCEND_CL - REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_npus); - REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_min_loss_scaling); -#endif + private: + std::string name_; + bool is_writable_; + void *value_ptr_; +}; +static void RegisterGlobalVarGetterSetter() { #ifdef PADDLE_WITH_DITRIBUTE - REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_send_thread_num, - FLAGS_rpc_get_thread_num, - FLAGS_rpc_prefetch_thread_num); + 
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_get_thread_num); + REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_prefetch_thread_num); #endif + + const auto &flag_map = platform::GetExportedFlagInfoMap(); + for (const auto &pair : flag_map) { + const std::string &name = pair.second.name; + bool is_writable = pair.second.is_writable; + void *value_ptr = pair.second.value_ptr; + const auto &default_value = pair.second.default_value; + RegisterGetterSetterVisitor visitor("FLAGS_" + name, is_writable, + value_ptr); + boost::apply_visitor(visitor, default_value); + } } + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 62279449e3ca7..5aae05db8cc58 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1947,8 +1947,8 @@ void BindImperative(py::module *m_ptr) { .def_property("_enable_program_desc_tracing", &imperative::Tracer::IsProgramDescTracingEnabled, &imperative::Tracer::SetEnableProgramDescTracing) - .def_property("_enable_autocast", &imperative::Tracer::IsAutoCastEnabled, - &imperative::Tracer::SetEnableAutoCast) + .def_property("_amp_level", &imperative::Tracer::AMPLevel, + &imperative::Tracer::SetAMPLevel) .def_property("_has_grad", &imperative::Tracer::HasGrad, &imperative::Tracer::SetHasGrad) .def_property( diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index b1a91cd302187..8ce7bea2d8e70 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -87,6 +87,7 @@ void BindPaddlePlace(py::module *m); void BindPaddlePredictor(py::module *m); void BindNativeConfig(py::module *m); void BindNativePredictor(py::module *m); +void BindLiteNNAdapterConfig(py::module *m); void BindAnalysisConfig(py::module *m); void BindAnalysisPredictor(py::module *m); void BindZeroCopyTensor(py::module *m); @@ -303,6 +304,7 @@ void BindInferenceApi(py::module *m) { BindPaddlePredictor(m); BindNativeConfig(m); BindNativePredictor(m); + BindLiteNNAdapterConfig(m); BindAnalysisConfig(m); BindAnalysisPredictor(m); BindPaddleInferPredictor(m); @@ -322,7 +324,7 @@ void BindInferenceApi(py::module *m) { auto pred = std::unique_ptr( new paddle_infer::Predictor(config)); - return std::move(pred); + return pred; }); m->def("copy_tensor", &CopyPaddleInferTensor); m->def("paddle_dtype_size", &paddle::PaddleDtypeSize); @@ -624,7 +626,26 @@ void BindAnalysisConfig(py::module *m) { [](AnalysisConfig &self) { return dynamic_cast(self.pass_builder()); }, - py::return_value_policy::reference); + py::return_value_policy::reference) + .def("nnadapter", &AnalysisConfig::NNAdapter); +} + +void BindLiteNNAdapterConfig(py::module *m) { + py::class_ lite_nnadapter_config(*m, + "LiteNNAdapterConfig"); + + lite_nnadapter_config + .def("set_device_names", &LiteNNAdapterConfig::SetDeviceNames) + .def("set_context_properties", &LiteNNAdapterConfig::SetContextProperties) + .def("set_model_cache_dir", &LiteNNAdapterConfig::SetModelCacheDir) + .def("set_model_cache_buffers", + &LiteNNAdapterConfig::SetModelCacheBuffers) + .def("set_subgraph_partition_config_path", + &LiteNNAdapterConfig::SetSubgraphPartitionConfigPath) + .def("set_subgraph_partition_config_buffer", + &LiteNNAdapterConfig::SetSubgraphPartitionConfigBuffer) + .def("enable", &LiteNNAdapterConfig::Enable) + .def("disable", &LiteNNAdapterConfig::Disable); } #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 
3da4a4b8e82ab..01d101909b549 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -54,6 +54,7 @@ std::map> op_ins_map = { {"gather", {"X", "Index", "Axis"}}, {"roi_pool", {"X", "ROIs", "RoisNum"}}, {"roi_align", {"X", "ROIs", "RoisNum"}}, + {"psroi_pool", {"X", "ROIs", "RoisNum"}}, {"collect_fpn_proposals", {"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}}, {"distribute_fpn_proposals", {"FpnRois", "RoisNum"}}, @@ -63,11 +64,18 @@ std::map> op_ins_map = { {"moving_average_abs_max_scale", {"X", "InAccum", "InState"}}, {"multiclass_nms3", {"BBoxes", "Scores", "RoisNum"}}, {"box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}}, - {"momentum", {"Param", "Grad", "Velocity", "LearningRate"}}, + {"momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}}, {"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}}, {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, {"run_program", {"X", "Params"}}, - {"matrix_rank", {"X", "TolTensor"}}}; + {"matrix_rank", {"X", "TolTensor"}}, + {"adam", + {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", + "Beta2Pow", "MasterParam"}}, + {"adamw", + {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", + "Beta2Pow", "MasterParam"}}, +}; // NOTE(zhiqiu): Like op_ins_map. // Commonly, the outputs in auto-generated OP function are determined by the @@ -97,12 +105,18 @@ std::map> op_outs_map = { {"Out", "OutScale", "OutAccum", "OutState"}}, {"multiclass_nms3", {"Out", "NmsRoisNum"}}, {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, - {"momentum", {"ParamOut", "VelocityOut"}}, + {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"sparse_momentum", {"ParamOut", "VelocityOut"}}, {"rnn", {"DropoutState", "Reserve", "Out", "State"}}, {"lamb", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, {"run_program", {"DOut"}}, + {"adam", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, + {"adamw", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, }; // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are @@ -119,13 +133,15 @@ std::map> op_outs_map = { std::map> op_passing_outs_map = { {"sgd", {"ParamOut"}}, {"adam", - {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, {"adamw", - {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, {"average_accumulates", {"out_sum_1", "out_sum_2", "out_sum_3", "out_num_accumulates", "out_old_num_accumulates", "out_num_updates"}}, - {"momentum", {"ParamOut", "VelocityOut"}}, + {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"sparse_momentum", {"ParamOut", "VelocityOut"}}, {"batch_norm", {"MeanOut", "VarianceOut"}}, {"sync_batch_norm", {"MeanOut", "VarianceOut"}}, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 5174306d722f4..c00f529f61793 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -38,6 +38,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/io/fs.h" #include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h" +#include "paddle/fluid/framework/ir/cost_model.h" #include "paddle/fluid/framework/ir/generate_pass.h" #include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/lod_rank_table.h" @@ -78,6 +79,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_ASCEND #include "paddle/fluid/pybind/ascend_wrapper_py.h" #endif +#include "paddle/fluid/pybind/bind_cost_model.h" #include "paddle/fluid/pybind/box_helper_py.h" #include "paddle/fluid/pybind/compatible.h" #include "paddle/fluid/pybind/const_value.h" @@ -1979,12 +1981,8 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "CostInfo") .def(py::init<>()) .def("total_time", [](CostInfo &self) { return self.total_time; }) - .def("host_memory_bytes", - [](CostInfo &self) { return self.host_memory_bytes; }) .def("device_memory_bytes", - [](CostInfo &self) { return self.device_memory_bytes; }) - .def("device_total_memory_bytes", - [](CostInfo &self) { return self.device_total_memory_bytes; }); + [](CostInfo &self) { return self.device_memory_bytes; }); py::class_(m, "StandaloneExecutor") .def(py::init pe(m, "ParallelExecutor"); py::class_ exec_strategy(pe, "ExecutionStrategy", R"DOC( ExecutionStrategy allows the user to more preciously control how to run diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc index 9ed1ed30324b2..60b99a964a57f 100644 --- a/paddle/fluid/pybind/reader_py.cc +++ b/paddle/fluid/pybind/reader_py.cc @@ -32,9 +32,10 @@ #include "paddle/fluid/platform/place.h" #include "pybind11/stl.h" -DEFINE_bool(reader_queue_speed_test_mode, false, - "If set true, the queue.pop will only get data from queue but not " - "remove the data from queue for speed testing"); +PADDLE_DEFINE_EXPORTED_bool( + reader_queue_speed_test_mode, false, + "If set true, the queue.pop will only get data from queue but not " + "remove the data from queue for speed testing"); namespace paddle { namespace pybind { diff --git a/paddle/fluid/string/string_helper.h b/paddle/fluid/string/string_helper.h index 37b713766dd55..1ab7690f8b517 100644 --- a/paddle/fluid/string/string_helper.h +++ b/paddle/fluid/string/string_helper.h @@ -53,7 +53,7 @@ template std::string format_string(const char* fmt, ARGS&&... args) { std::string str; format_string_append(str, fmt, args...); - return std::move(str); + return str; } template diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 1cc4927e73960..0283de66ba5af 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -76,6 +76,8 @@ if not defined NIGHTLY_MODE set PRECISION_TEST=OFF if not defined retry_times set retry_times=3 if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37 if not defined BUILD_DIR set BUILD_DIR=build +set task_name=%1 +set UPLOAD_TP_FILE=OFF rem ------initialize the python environment------ set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe @@ -363,17 +365,24 @@ echo echo ${md5_content}^>md5.txt >> cache.sh %cache_dir%\tools\busybox64.exe bash cache.sh set /p md5=< md5.txt +echo %task_name%|findstr build >nul && ( + set THIRD_PARTY_HOME=%cache_dir:\=/%/third_party + set THIRD_PARTY_PATH=!THIRD_PARTY_HOME!/%md5% + echo %task_name% is a whl-build task, will only reuse local third_party cache. + goto :cmake_impl +) || ( + echo %task_name% is a PR-CI-Windows task, will try to reuse bos and local third_party cache both. 
+) + if "%WITH_GPU%"=="ON" ( - for /F "delims=" %%# in ('nvcc --version^|findstr V1') do set cuda_version=%%# - set cuda_version=!cuda_version:~-7,4! + for /F %%# in ('dir /b /d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\"') do set cuda_version=%%# + set cuda_version=!cuda_version:~-4! set sub_dir=cuda!cuda_version:.=! ) else ( set sub_dir=cpu ) - set THIRD_PARTY_HOME=%cache_dir:\=/%/third_party/%sub_dir% set THIRD_PARTY_PATH=%THIRD_PARTY_HOME%/%md5% -set UPLOAD_TP_FILE=OFF if not exist %THIRD_PARTY_PATH% ( echo There is no usable third_party cache in %THIRD_PARTY_PATH%, will download from bos. @@ -397,7 +406,7 @@ if not exist %THIRD_PARTY_PATH% ( if not exist %THIRD_PARTY_PATH% ( set UPLOAD_TP_FILE=ON ) cd %work_dir%\%BUILD_DIR% ) else ( - echo Found reusable third_party cache locally, will reuse it. + echo Found reusable third_party cache in %THIRD_PARTY_PATH%, will reuse it. ) :cmake_impl @@ -512,8 +521,8 @@ if %ERRORLEVEL% NEQ 0 ( ) ) -set BCE_FILE=%cache_dir%\bce-python-sdk-0.8.33\BosClient.py -if %UPLOAD_TP_FILE%==ON ( +if "%UPLOAD_TP_FILE%"=="ON" ( + set BCE_FILE=%cache_dir%\bce-python-sdk-0.8.33\BosClient.py echo Uploading third_party: checking bce ... if not exist %cache_dir%\bce-python-sdk-0.8.33 ( echo There is no bce in this PC, will install bce. diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index a548bb304f436..0c2580929081d 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -223,7 +223,7 @@ function cmake_base() { -DWITH_GLOO=${gloo_flag} -DWITH_LITE=${WITH_LITE:-OFF} -DWITH_XPU=${WITH_XPU:-OFF} - -DLITE_GIT_TAG=release/v2.8 + -DLITE_GIT_TAG=release/v2.10 -DWITH_UNITY_BUILD=${WITH_UNITY_BUILD:-OFF} -DWITH_XPU_BKCL=${WITH_XPU_BKCL:-OFF} -DWITH_ARM=${WITH_ARM:-OFF} @@ -266,7 +266,7 @@ EOF -DWITH_PSCORE=${distibuted_flag} \ -DWITH_PSLIB=${WITH_PSLIB:-OFF} \ -DWITH_GLOO=${gloo_flag} \ - -DLITE_GIT_TAG=release/v2.8 \ + -DLITE_GIT_TAG=release/v2.10 \ -DWITH_XPU=${WITH_XPU:-OFF} \ -DXPU_SDK_ROOT=${XPU_SDK_ROOT:-""} \ -DWITH_LITE=${WITH_LITE:-OFF} \ @@ -1161,8 +1161,8 @@ function parallel_test_base_gpu() { EOF set -x - # set trt_convert ut to run 30% cases. - export TEST_NUM_PERCENT_CASES=0.3 + # set trt_convert ut to run 15% cases. + export TEST_NUM_PERCENT_CASES=0.15 precison_cases="" bash $PADDLE_ROOT/tools/check_added_ut.sh if [ ${PRECISION_TEST:-OFF} == "ON" ]; then diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 6feef11a366d9..e94805be5a147 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" +#include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/npu_info.h" @@ -22,13 +23,11 @@ int main(int argc, char** argv) { paddle::memory::allocation::UseAllocatorStrategyGFlag(); testing::InitGoogleTest(&argc, argv); std::vector new_argv; - std::string gflags_env; for (int i = 0; i < argc; ++i) { new_argv.push_back(argv[i]); } std::vector envs; - std::vector undefok; #if defined(PADDLE_WITH_DISTRIBUTE) && !defined(PADDLE_WITH_PSLIB) std::string str_max_body_size; if (::GFLAGS_NAMESPACE::GetCommandLineOption("max_body_size", @@ -38,35 +37,13 @@ int main(int argc, char** argv) { } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_ASCEND_CL) - envs.push_back("fraction_of_gpu_memory_to_use"); - envs.push_back("initial_gpu_memory_in_mb"); - envs.push_back("reallocate_gpu_memory_in_mb"); - envs.push_back("allocator_strategy"); - envs.push_back("selected_gpus"); -#elif __clang__ - envs.push_back("use_mkldnn"); - envs.push_back("initial_cpu_memory_in_mb"); - envs.push_back("allocator_strategy"); - - undefok.push_back("use_mkldnn"); - undefok.push_back("initial_cpu_memory_in_mb"); -#else - envs.push_back("use_pinned_memory"); - envs.push_back("use_mkldnn"); - envs.push_back("initial_cpu_memory_in_mb"); - envs.push_back("allocator_strategy"); - - undefok.push_back("use_pinned_memory"); - undefok.push_back("use_mkldnn"); - undefok.push_back("initial_cpu_memory_in_mb"); -#endif - -#if defined(PADDLE_WITH_ASCEND_CL) - envs.push_back("selected_npus"); - envs.push_back("npu_config_path"); -#endif + const auto& flag_map = paddle::platform::GetExportedFlagInfoMap(); + for (const auto& pair : flag_map) { + const std::string& name = pair.second.name; + if (pair.second.is_writable) { // means public + envs.push_back(name); + } + } char* env_str = nullptr; if (envs.size() > 0) { @@ -80,18 +57,6 @@ int main(int argc, char** argv) { VLOG(1) << "gtest env_string:" << env_string; } - char* undefok_str = nullptr; - if (undefok.size() > 0) { - std::string undefok_string = "--undefok="; - for (auto t : undefok) { - undefok_string += t + ","; - } - undefok_string = undefok_string.substr(0, undefok_string.length() - 1); - undefok_str = strdup(undefok_string.c_str()); - new_argv.push_back(undefok_str); - VLOG(1) << "gtest undefok_string:" << undefok_string; - } - int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); ::GFLAGS_NAMESPACE::ParseCommandLineFlags( @@ -105,7 +70,5 @@ int main(int argc, char** argv) { #endif if (env_str) free(env_str); - if (undefok_str) free(undefok_str); - return ret; } diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index a5f78a9f31a90..b7a601f53fd85 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -88,14 +88,14 @@ IF(WIN32) COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle/ COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMENT "Packing whl packages------>>>" - DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES}) + DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto pass_desc_py_proto ${PY_FILES}) ELSE(WIN32) add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND touch stub.cc COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle 
${PADDLE_BINARY_DIR}/python COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMENT "Packing whl packages------>>>" - DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES}) + DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto pass_desc_py_proto ${PY_FILES}) ENDIF() add_custom_target(paddle_python ALL DEPENDS ${PADDLE_PYTHON_BUILD_DIR}/.timestamp) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index ac9c60b4f5708..024415664d8a6 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -64,6 +64,7 @@ import paddle.static # noqa: F401 import paddle.vision # noqa: F401 +from .tensor import fft from .tensor.random import bernoulli # noqa: F401 from .tensor.attribute import rank # noqa: F401 @@ -100,10 +101,11 @@ from .tensor.linalg import bmm # noqa: F401 from .tensor.linalg import histogram # noqa: F401 from .tensor.linalg import mv # noqa: F401 -from .tensor.linalg import multi_dot # noqa: F401 +from .tensor.linalg import det # noqa: F401 +from .tensor.linalg import slogdet # noqa: F401 from .tensor.linalg import matrix_power # noqa: F401 from .tensor.linalg import svd # noqa: F401 -from .tensor.linalg import eigh # noqa: F401 +from .tensor.linalg import solve # noqa: F401 from .tensor.logic import equal # noqa: F401 from .tensor.logic import greater_equal # noqa: F401 from .tensor.logic import greater_than # noqa: F401 diff --git a/python/paddle/amp/__init__.py b/python/paddle/amp/__init__.py index 64992752b2e8d..381aad8850bc1 100644 --- a/python/paddle/amp/__init__.py +++ b/python/paddle/amp/__init__.py @@ -14,5 +14,6 @@ from .auto_cast import auto_cast # noqa: F401 from .grad_scaler import GradScaler # noqa: F401 +from .auto_cast import decorate # noqa: F401 -__all__ = ['auto_cast', 'GradScaler'] +__all__ = ['auto_cast', 'GradScaler', 'decorate'] diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index 974f718c2d4e2..9d4b84c5047e3 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -13,18 +13,22 @@ # limitations under the License. from paddle.fluid.dygraph.amp import amp_guard +from paddle.fluid.dygraph.amp import amp_decorate __all__ = [] -def auto_cast(enable=True, custom_white_list=None, custom_black_list=None): +def auto_cast(enable=True, + custom_white_list=None, + custom_black_list=None, + level='O1'): """ Create a context which enables auto-mixed-precision(AMP) of operators executed in dynamic graph mode. If enabled, the input data type (float32 or float16) of each operator is decided by autocast algorithm for better performance. Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in - imperative mode. + imperative mode. It is used together with `decorator` to achieve Pure fp16 in imperative mode. Args: enable(bool, optional): Enable auto-mixed-precision or not. Default is True. @@ -34,6 +38,8 @@ def auto_cast(enable=True, custom_white_list=None, custom_black_list=None): custom_black_list(set|list|tuple, optional): The custom black_list. The set of ops that support fp16 calculation and are considered numerically-dangerous and whose effects may also be observed in downstream ops. These ops will not be converted to fp16. + level(str, optional): Auto mixed precision level. 
Accepted values are "O1" and "O2": O1 represent mixed precision, the input data type of each operator will be casted by white_list and black_list; + O2 represent Pure fp16, all operators parameters and input data will be casted to fp16, except operators in black_list, don't support fp16 kernel and batchnorm. Default is O1(amp) Examples: @@ -61,6 +67,67 @@ def auto_cast(enable=True, custom_white_list=None, custom_black_list=None): with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}): c = a + b print(c.dtype) # FP16 + + with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}, level='O2'): + d = a + b + print(d.dtype) # FP16 + + """ + return amp_guard(enable, custom_white_list, custom_black_list, level) + + +def decorate(models, + optimizers=None, + level='O1', + master_weight=None, + save_dtype=None): + """ + Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), the decorate will do nothing. + When level is O2(pure fp16), the decorate will cast all parameters of models to FP16, except BatchNorm and LayerNorm. + + Commonly, it is used together with `auto_cast` to achieve Pure fp16 in imperative mode. + + Args: + models(Layer|list of Layer, optional): The defined models by user, models must be either a single model or a list of models. Default is None. + optimizers(Optimizer|list of Optimizer, optional): The defined optimizers by user, optimizers must be either a single optimizer or a list of optimizers. Default is None. + level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represent mixed precision, the decorator will do nothing; + O2 represent Pure fp16, the decorator will cast all parameters of models to FP16, except BatchNorm and LayerNorm. Default is O1(amp) + master_weight(bool, optinal): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, in O2 level optimizer will use multi-precision. Default is None. + save_dtype(float, optional): The save model parameter dtype when use `paddle.save` or `paddle.jit.save`,it should be float16, float32, float64 or None. + The save_dtype will not change model parameters dtype, it just change the state_dict dtype. When save_dtype is None, the save dtype is same as model dtype. Default is None. + + Examples: + + .. 
code-block:: python + + # required: gpu + # Demo1: single model and optimizer: + import paddle + + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=False) + optimzier = paddle.optimizer.SGD(parameters=model.parameters()) + + model, optimizer = paddle.amp.decorate(models=model, optimizers=optimzier, level='O2') + + data = paddle.rand([10, 3, 32, 32]) + + with paddle.amp.auto_cast(enable=True, custom_white_list=None, custom_black_list=None, level='O2'): + output = model(data) + print(output.dtype) # FP16 + + # required: gpu + # Demo2: multi models and optimizers: + model2 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False) + optimizer2 = paddle.optimizer.Adam(parameters=model2.parameters()) + + models, optimizers = paddle.amp.decorate(models=[model, model2], optimizers=[optimzier, optimizer2], level='O2') + + data = paddle.rand([10, 3, 32, 32]) + with paddle.amp.auto_cast(enable=True, custom_white_list=None, custom_black_list=None, level='O2'): + output = models[0](data) + output2 = models[1](data) + print(output.dtype) # FP16 + print(output2.dtype) # FP16 """ - return amp_guard(enable, custom_white_list, custom_black_list) + return amp_decorate(models, optimizers, level, master_weight, save_dtype) diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index 5c3b575f2f069..83f57fc74e89a 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -13,18 +13,28 @@ # limitations under the License. from paddle.fluid.dygraph.amp import AmpScaler +from paddle.fluid.dygraph.amp import OptimizerState +from collections import defaultdict __all__ = [] +def _refresh_optimizer_state(): + return {"state": OptimizerState.INIT} + + class GradScaler(AmpScaler): """ GradScaler is used for Auto-Mixed-Precision training in dynamic graph mode. It controls the scaling of loss, helps avoiding numerical overflow. - The object of this class has two methods `scale()`, `minimize()`. + The object of this class has nineteen methods `scale()`, `unscale_()`, `minimize()`, `step()`, `update()` and `get`/`set` api of parameters. `scale()` is used to multiply the loss by a scale ratio. - `minimize()` is similar as `optimizer.minimize()`, performs parameters updating. + `unscale_()` is used to unscale the gradients of parameters, multiplies the gradients of parameters by 1/(scale ratio) + `minimize()` is similar as `optimizer.minimize()`, performs parameters updating, and it will update the loss_scaling, it equal to `step()` + `update()`. + `step()` is similar as `optimizer.step()`, which performs parameters updating. + `update` is used to update the loss_scaling. + Commonly, it is used together with `paddle.amp.auto_cast` to achieve Auto-Mixed-Precision in dynamic graph mode. @@ -115,7 +125,7 @@ def minimize(self, optimizer, *args, **kwargs): This function is similar as `optimizer.minimize()`, which performs parameters updating. If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped. - Otherwise, it first unscales the scaled gradients of parameters, then updates the parameters. + Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters. Finally, the loss scaling ratio is updated. @@ -151,16 +161,18 @@ def step(self, optimizer): This function is similar as `optimizer.step()`, which performs parameters updating. If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped. 
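To make the relationship between the pieces documented in these docstrings concrete, here is a hedged end-to-end sketch, not taken from the patch, combining decorate(level='O2'), auto_cast(level='O2') and the scale -> unscale_ -> step -> update sequence on GradScaler; the model, data shapes and SGD optimizer are illustrative placeholders and a GPU device is assumed.

    import paddle

    model = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
    optimizer = paddle.optimizer.SGD(parameters=model.parameters())
    # O2: parameters are cast to fp16 (BatchNorm/LayerNorm excepted).
    model, optimizer = paddle.amp.decorate(models=model, optimizers=optimizer,
                                           level='O2')
    scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    for _ in range(4):
        data = paddle.rand([10, 3, 32, 32])
        with paddle.amp.auto_cast(level='O2'):
            loss = paddle.mean(model(data))
        scaler.scale(loss).backward()   # scaled backward pass
        scaler.unscale_(optimizer)      # optional: expose true gradients here
        scaler.step(optimizer)          # skipped automatically on inf/nan
        scaler.update()                 # adjust the loss scaling ratio
        optimizer.clear_grad()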
- Otherwise, it first unscales the scaled gradients of parameters, then updates the parameters. + Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters. Args: optimizer(Optimizer): The optimizer used to update parameters. Examples: + .. code-block:: python # required: gpu import paddle + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) scaler = paddle.amp.GradScaler(init_loss_scaling=1024) @@ -170,14 +182,21 @@ def step(self, optimizer): loss = paddle.mean(conv) scaled = scaler.scale(loss) # scale the loss scaled.backward() # do backward - scaler.step(optimizer) + scaler.step(optimizer) # update parameters + scaler.update() # update the loss scaling ratio optimizer.clear_grad() """ if not self._enable: return optimizer.step() + optimizer_state = self._optimizer_states[id(optimizer)] + if optimizer_state["state"] is OptimizerState.STEPPED: + raise RuntimeError( + "step() has already been called since the last update().") + # unscale the grad - self._unscale(optimizer) + if optimizer_state["state"] is OptimizerState.INIT: + self._unscale(optimizer) if self._found_inf: self._cache_founf_inf = True @@ -185,9 +204,75 @@ def step(self, optimizer): optimizer.step() self._cache_founf_inf = False + optimizer_state["state"] = OptimizerState.STEPPED + + if not self._use_dynamic_loss_scaling: + self._optimizer_states = defaultdict(_refresh_optimizer_state) + + def update(self): + """ + Updates the loss_scaling. + + Examples: + + .. code-block:: python + + # required: gpu + import paddle + + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + with paddle.amp.auto_cast(): + conv = model(data) + loss = paddle.mean(conv) + scaled = scaler.scale(loss) # scale the loss + scaled.backward() # do backward + scaler.step(optimizer) # update parameters + scaler.update() # update the loss scaling ratio + optimizer.clear_grad() + """ + if not self._enable: + return if self._use_dynamic_loss_scaling: - # uopdate the scale self._update() + self._optimizer_states = defaultdict(_refresh_optimizer_state) + return + + def unscale_(self, optimizer): + """ + Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio). + If this instance of :class:`GradScaler` is not enabled, output are returned unmodified. + + Args: + optimizer(Optimizer): The optimizer used to update parameters. + + Returns: + The unscaled parameters or original parameters. + + Examples: + + .. 
code-block:: python + + # required: gpu + import paddle + + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + with paddle.amp.auto_cast(): + conv = model(data) + loss = paddle.mean(conv) + scaled = scaler.scale(loss) # scale the loss + scaled.backward() # do backward + scaler.unscale_(optimizer) # unscale the parameter + scaler.step(optimizer) + scaler.update() + optimizer.clear_grad() + """ + return super(GradScaler, self)._unscale(optimizer) def is_enable(self): """ diff --git a/python/paddle/cost_model/cost_model.py b/python/paddle/cost_model/cost_model.py new file mode 100644 index 0000000000000..93c89d0c89297 --- /dev/null +++ b/python/paddle/cost_model/cost_model.py @@ -0,0 +1,69 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.static as static +import numpy as np +from paddle.fluid import core + + +class CostModel(): + def __init__(self): + pass + + def build_program(self): + paddle.enable_static() + + main_program = static.Program() + startup_program = static.Program() + with static.program_guard( + main_program=main_program, startup_program=startup_program): + data = paddle.static.data( + name='X', shape=[None, 1], dtype='float32') + hidden = paddle.static.nn.fc(data, 10) + loss = paddle.mean(hidden) + paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) + + print("main program is: {}".format(main_program)) + #print("start up program is: {}".format(startup_program)) + + return startup_program, main_program + + def profile_measure(self, + startup_program, + main_program, + device='gpu', + fetch_cost_list=['time', 'memory']): + + place = paddle.set_device('gpu') + x = np.random.random(size=(10, 1)).astype('float32') + exe = paddle.static.Executor(place) + + exe.run(startup_program) + paddle.fluid.profiler.start_profiler("All") + exe.run(main_program, feed={"X": x}, fetch_list=[]) + # core.CostModel.ProfileMeasure(main_program, device) + print("core:<<<<<<<") + + cost_model = core.CostModel() + cost_data = cost_model.ProfileMeasure(device) + # cost_list = self.stop_cost_model() + # return cost_list + + +cost_model = CostModel() + +startup_program, main_program = cost_model.build_program() + +cost_model.profile_measure(startup_program, main_program) diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 4934f2093f3fb..20007f76ed5e4 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -13,7 +13,7 @@ # limitations under the License. 
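A usage sketch for the new `CostModel` helper added above; the module path is assumed from the file location in this patch, and note that the file as committed also runs the same demo at module level, so importing it triggers a profiling pass:

    from paddle.cost_model.cost_model import CostModel  # path assumed from the new file's location

    cost_model = CostModel()
    startup_program, main_program = cost_model.build_program()
    # profiles one run of the program; in this version the cost list is not yet
    # returned (that return is still commented out), so output goes to the profiler log
    cost_model.profile_measure(startup_program, main_program, device='gpu')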
from .spawn import spawn # noqa: F401 -from paddle.distributed.fleet.launch import launch # noqa: F401 +from .fleet.launch import launch # noqa: F401 from .parallel import init_parallel_env # noqa: F401 from .parallel import get_rank # noqa: F401 diff --git a/python/paddle/distributed/fleet/elastic/collective.py b/python/paddle/distributed/fleet/elastic/collective.py index 94fe6a54b5809..83f0e85db2bad 100644 --- a/python/paddle/distributed/fleet/elastic/collective.py +++ b/python/paddle/distributed/fleet/elastic/collective.py @@ -13,8 +13,8 @@ # limitations under the License. from paddle.distributed.fleet import launch_utils -import paddle.distributed.fleet.cloud_utils as cloud_utils -import paddle.distributed.fleet.ascend_utils as ascend_utils +from paddle.distributed.fleet import cloud_utils +from paddle.distributed.fleet import ascend_utils from paddle.distributed.fleet.launch_utils import * diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 2920dd5870ac1..c0a1c359d17c6 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -72,8 +72,8 @@ # TODO(danleifeng): Don't import * from a module from paddle.distributed.fleet.launch_utils import * -import paddle.distributed.fleet.cloud_utils as cloud_utils -import paddle.distributed.fleet.ascend_utils as ascend_utils +from paddle.distributed.fleet import cloud_utils +from paddle.distributed.fleet import ascend_utils from paddle.distributed.fleet.elastic import enable_elastic, launch_elastic @@ -400,33 +400,33 @@ def launch(): Base Parameters: - - ``--log_dir``: The path for each process's log. e.g ``--log_dir=output_dir``. Default ``--log_dir=log``. + - ``--log_dir``: The path for each process's log. e.g., ``--log_dir=output_dir``. Default ``--log_dir=log``. - - ``--nproc_per_node``: The number of processes to launch on a node. In gpu training, it should be less or equal to the gpus number of you system(or you set by --gpus). And so each process can bound to one or average number of gpus. e.g ``--nproc_per_node=8`` + - ``--nproc_per_node``: The number of processes to launch on a node. In gpu training, it should be less or equal to the gpus number of you system(or you set by --gpus). e.g., ``--nproc_per_node=8`` - - ``--run_mode``: run mode of job, can be:collective/ps/ps-heter. e.g ``--run_mode=ps``. Default ``--run_mode=collective``. + - ``--run_mode``: run mode of job, can be:collective/ps/ps-heter. e.g., ``--run_mode=ps``. Default ``--run_mode=collective``. - - ``--gpus``: It's for gpu training. e.g ``--gpus=0,1,2,3`` will launch four training processes each bound to one gpu. + - ``--gpus``: It's for gpu training. e.g., ``--gpus=0,1,2,3`` will launch four training processes each bound to one gpu. - ``--selected_gpus``: gpus aliases, recommend to use ``--gpus``. - - ``--xpus``: It's for xpu training if xpu is available. e.g ``--xpus=0,1,2,3``. + - ``--xpus``: It's for xpu training if xpu is available. e.g., ``--xpus=0,1,2,3``. - ``--selected_xpus``: xpus aliases, recommend to use ``--xpus``. - - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g ``traing.py`` + - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``traing.py`` - - ``training_script_args``: The args of training_script. 
e.g ``--lr=0.1`` + - ``training_script_args``: The args of training_script. e.g., ``--lr=0.1`` Collective Parameters: - - ``--ips``: Paddle cluster nodes ips, e.g ``--ips=192.168.0.16,192.168.0.17``. Default ``--ips=127.0.0.1``. + - ``--ips``: Paddle cluster nodes ips, e.g., ``--ips=192.168.0.16,192.168.0.17``. Default ``--ips=127.0.0.1``. Parameter-Server Parameters: - - ``--servers``: User defined servers ip:port, e.g ``--servers="192.168.0.16:6170,192.168.0.17:6170"`` + - ``--servers``: User defined servers ip:port, e.g., ``--servers="192.168.0.16:6170,192.168.0.17:6170"`` - - ``--workers``: User defined workers ip:port, e.g ``--workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172"`` + - ``--workers``: User defined workers ip:port, e.g., ``--workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172"`` - - ``--heter_workers``: User defined heter workers ip:port, e.g ``--heter_workers="192.168.0.16:6172,192.168.0.17:6172"`` + - ``--heter_workers``: User defined heter workers ip:port, e.g., ``--heter_workers="192.168.0.16:6172,192.168.0.17:6172"`` - ``--worker_num``: Number of workers (It recommend to set when in the emulated distributed environment using single node) @@ -437,17 +437,14 @@ def launch(): - ``--http_port``: Gloo http Port Elastic Parameters: - - ``--elastic_server``: etcd server host:port, e.g ``--elastic_server=127.0.0.1:2379`` + - ``--elastic_server``: etcd server host:port, e.g., ``--elastic_server=127.0.0.1:2379`` - - ``--job_id``: job unique id, e.g ``--job_id=job1`` + - ``--job_id``: job unique id, e.g., ``--job_id=job1`` - - ``--np``: job pod/node number, e.g ``--np=2`` - - - ``--scale``: scale np, not be used now! + - ``--np``: job pod/node number, e.g., ``--np=2`` - ``--host``: bind host, default to POD_IP env. - - ``--force``: update np force, not be used now! Returns: ``None`` @@ -456,7 +453,7 @@ def launch(): .. code-block:: bash :name: code-block-example-bash1 - # For single node training using 4 gpus + # For training on single node using 4 gpus. python -m paddle.distributed.launch --gpus=0,1,2,3 train.py --lr=0.01 @@ -464,7 +461,9 @@ def launch(): .. code-block:: bash :name: code-block-example-bash2 - # For multiple node training such as two node:192.168.0.16, 192.168.0.17 + # The parameters of --gpus and --ips must be consistent in each node. + + # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17 # On 192.168.0.16: @@ -477,7 +476,7 @@ def launch(): .. code-block:: bash :name: code-block-example-bash3 - # The emulated distributed environment using single node, 2 server and 4 worker + # To simulate distributed environment using single node, e.g., 2 servers and 4 workers. python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01 @@ -485,7 +484,7 @@ def launch(): .. code-block:: bash :name: code-block-example-bash4 - # For multiple node training such as two node:192.168.0.16, 192.168.0.17 with 2 servers and total 4 workers + # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17 where each node with 1 server and 2 workers. # On 192.168.0.16: @@ -499,7 +498,7 @@ def launch(): .. code-block:: bash :name: code-block-example-bash5 - # The emulated distributed environment using single node, 2 server and 4 worker, each worker use single gpu + # To simulate distributed environment using single node, e.g., 2 servers and 4 workers, each worker use single gpu. 
export CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01 @@ -508,7 +507,7 @@ def launch(): .. code-block:: bash :name: code-block-example-bash6 - # For multiple node training such as two node:192.168.0.16, 192.168.0.17 with 2 servers and total 4 workers + # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17 where each node with 1 server and 2 workers. # On 192.168.0.16: @@ -524,7 +523,7 @@ def launch(): .. code-block:: bash :name: code-block-example-bash7 - # The emulated distributed environment using single node, 2 server and 4 worker, two worker use gpu, two worker use cpu + # To simulate distributed environment using single node, e.g., 2 servers and 4 workers, two workers use gpu, two workers use cpu. export CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch --server_num=2 --worker_num=2 --heter_worker_num=2 train.py --lr=0.01 @@ -533,7 +532,7 @@ def launch(): .. code-block:: bash :name: code-block-example-bash8 - # For multiple node training such as two node:192.168.0.16, 192.168.0.17 with 2 servers and total 4 workers + # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17 where each node with 1 server, 1 gpu worker, 1 cpu worker. # On 192.168.0.16: diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py index 8aee34960332a..3816e9b3051ab 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py @@ -213,6 +213,7 @@ def remove_param(input_name): if out_name in param_name_to_offload_name: var_name = out_name + # FIXME(wangxi): offload should insert after broadcast param if offload: offload_var_name = param_name_to_offload_name[var_name] self._insert_offload_op(startup_block, idx + 1, diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index f14f1e0662402..1af646b3959e0 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -1380,10 +1380,18 @@ def _initialization_broadcast(self): return startup_block = self._startup_program.global_block() - - params = [] - for param in startup_block.iter_parameters(): - params.append(param) + params = startup_block.all_parameters() + + broadcast_params = [] + for param in params: + broadcast_params.append(param) + # optimize_cast need broadcast fp16 param + fp16_param_name = param.name + '.cast_fp16' + if startup_block.has_var(fp16_param_name): + fp16_param = startup_block.var(fp16_param_name) + broadcast_params.append(fp16_param) + + for param in broadcast_params: startup_block.append_op( type='c_broadcast', inputs={'X': param}, @@ -1395,8 +1403,8 @@ def _initialization_broadcast(self): }) startup_block.append_op( type='c_sync_comm_stream', - inputs={'X': params}, - outputs={'Out': params}, + inputs={'X': broadcast_params}, + outputs={'Out': broadcast_params}, attrs={'ring_id': self.dp_ring_id, OP_ROLE_KEY: OpRole.Forward}) diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 8fad0686dd42e..431bc6d7bc389 100755 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ 
b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -329,6 +329,7 @@ def _broadcast_final_loss(self): def _optimizer_step(self): if self.scaler: self.scaler.step(self.optimizer) + self.scaler.update() else: self.optimizer.step() diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py index 598c4b26423fd..b29b0b3e27557 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py @@ -198,7 +198,11 @@ def forward(ctx, run_function, all_outputs, *args): # TODO support AMP tracer = framework._dygraph_tracer() - ctx.is_fw_autocast = tracer._enable_autocast + if tracer._amp_level == 0: + ctx.is_fw_autocast = False + else: + ctx.is_fw_autocast = True + ctx.amp_mode = 'O1' ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list() with paddle.no_grad(): @@ -258,7 +262,8 @@ def backward(ctx, *args): with paddle.amp.auto_cast( enable=ctx.is_fw_autocast, custom_white_list=ctx.amp_white_list, - custom_black_list=ctx.amp_black_list): + custom_black_list=ctx.amp_black_list, + level=ctx.amp_mode): detached_inputs = detach_variable(tuple(inputs)) outputs = ctx.run_function(*detached_inputs) diff --git a/python/paddle/distributed/fleet/utils/ps_util.py b/python/paddle/distributed/fleet/utils/ps_util.py index 8bf69a41a7cc8..ba6fd54a60a5e 100644 --- a/python/paddle/distributed/fleet/utils/ps_util.py +++ b/python/paddle/distributed/fleet/utils/ps_util.py @@ -128,9 +128,113 @@ def _get_pull_sparse_ops(_program): return pull_sparse_ops def _pull_sparse_fuse(_program, pull_sparse_ops): + def dag_check_up_and_reorder(program, inputs, outputs): + global_block = program.global_block() + min_output_index = len(global_block.ops) + max_input_index = -1 + input_indexes = [0] * len(global_block.ops) + output_indexes = [0] * len(global_block.ops) + for idx, op in enumerate(global_block.ops): + for i in range(0, len(op.output_names)): + if input_indexes[idx] == 1: + break + outs = op.output(op.output_names[i]) + for in_id, in_var in enumerate(inputs): + if in_var.name in outs: + input_indexes[idx] = 1 + max_input_index = max(max_input_index, idx) + break + + for i in range(0, len(op.input_names)): + if output_indexes[idx] == 1: + break + ins = op.input(op.input_names[i]) + for out_id, out_var in enumerate(outputs): + if out_var.name in ins: + output_indexes[idx] = 1 + min_output_index = min(min_output_index, + idx) + + for i in range(len(global_block.ops)): + if input_indexes[i] == 1 and output_indexes[i] == 1: + warnings.warn( + "unable to re-arrange dags order to combine distributed embedding ops because a op both needs embedding table's output as input and produces ids as the same embedding table's input" + ) + return + + if min_output_index < max_input_index: + move_ops = [] + for i in range(min_output_index + 1, + len(input_indexes)): + if input_indexes[i] == 1: + move_ops.append((global_block.ops[i], i)) + for i, op in enumerate(move_ops): + queue = list() + visited = set() + queue.append(op[1]) + visited.add(op[0]) + start = 0 + while start < len(queue): + pos = queue[start] + op = global_block.ops[pos] + op_inputs = [] + for k in range(0, len(op.input_names)): + ins = op.input(op.input_names[k]) + op_inputs.append(ins) + for j in range(pos - 1, min_output_index - 1, + -1): + op1 = global_block.ops[j] + if op1 in visited: + continue + found = False + for k in range(0, len(op1.output_names)): + outs = 
op1.output(op1.output_names[k]) + for t in range(len(op_inputs)): + for y in op_inputs[t]: + if y in outs: + found = True + break + if found: + break + if found: + break + if found: + if output_indexes[j] == True: + warnings.warn( + "unable to re-arrange dags order to combine distributed embedding ops" + ) + return + queue.append(j) + visited.add(global_block.ops[j]) + start = start + 1 + + queue.sort() + for index in queue: + desc = global_block.desc._insert_op( + min_output_index) + desc.copy_from(global_block.ops[index].desc) + global_block.desc._remove_op(index + 1, + index + 2) + global_block.ops[index].desc = desc + insert_op = global_block.ops.pop(index) + input_state = input_indexes.pop(index) + output_state = output_indexes.pop(index) + global_block.ops.insert(min_output_index, + insert_op) + input_indexes.insert(min_output_index, + input_state) + output_indexes.insert(min_output_index, + output_state) + min_output_index = min_output_index + 1 + + assert global_block.desc.op_size() == len( + global_block.ops) + for i in range(len(global_block.ops)): + assert global_block.desc.op(i) == global_block.ops[ + i].desc + for param, ops in pull_sparse_ops.items(): all_ops = program.global_block().ops - op_idxs = [all_ops.index(op) for op in ops] inputs = [ program.global_block().vars[op.input("Ids")[0]] @@ -155,23 +259,29 @@ def _pull_sparse_fuse(_program, pull_sparse_ops): for op in ops ] + dag_check_up_and_reorder(program, inputs, outputs) + op_idxs = [all_ops.index(op) for op in ops] + for idx in op_idxs[::-1]: program.global_block()._remove_op(idx) inputs_idxs = [-1] * len(inputs) - outputs_idxs = [-1] * len(outputs) + outputs_idxs = [len(program.global_block().ops) + 1] * len( + outputs) for idx, op in enumerate(program.global_block().ops): for i in range(0, len(op.output_names)): outs = op.output(op.output_names[i]) for in_id, in_var in enumerate(inputs): if in_var.name in outs: - inputs_idxs[in_id] = idx + inputs_idxs[in_id] = max(idx, + inputs_idxs[in_id]) for i in range(0, len(op.input_names)): ins = op.input(op.input_names[i]) for out_id, out_var in enumerate(outputs): if out_var.name in ins: - outputs_idxs[out_id] = idx + outputs_idxs[out_id] = min( + idx, outputs_idxs[out_id]) if min(outputs_idxs) - max(inputs_idxs) >= 1: distributed_idx = max(inputs_idxs) + 1 diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py index 89b14258c195c..302877e51fe01 100755 --- a/python/paddle/distributed/fleet/utils/recompute.py +++ b/python/paddle/distributed/fleet/utils/recompute.py @@ -98,7 +98,11 @@ def forward(ctx, run_function, preserve_rng_state, *args): # TODO support AMP tracer = framework._dygraph_tracer() - ctx.is_fw_autocast = tracer._enable_autocast + if tracer._amp_level == 0: + ctx.is_fw_autocast = False + else: + ctx.is_fw_autocast = True + ctx.amp_mode = 'O1' ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list() with paddle.no_grad(): @@ -128,14 +132,16 @@ def backward(ctx, *args): with paddle.amp.auto_cast( enable=ctx.is_fw_autocast, custom_white_list=ctx.amp_white_list, - custom_black_list=ctx.amp_black_list): + custom_black_list=ctx.amp_black_list, + level=ctx.amp_mode): detached_inputs = detach_variable(tuple(inputs)) outputs = ctx.run_function(*detached_inputs) else: with paddle.amp.auto_cast( enable=ctx.is_fw_autocast, custom_white_list=ctx.amp_white_list, - custom_black_list=ctx.amp_black_list): + custom_black_list=ctx.amp_black_list, + level=ctx.amp_mode): detached_inputs = 
detach_variable(tuple(inputs)) outputs = ctx.run_function(*detached_inputs) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 30e3f9dec973c..8ddac967e7bd3 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -176,83 +176,23 @@ def __bootstrap__(): print('PLEASE USE OMP_NUM_THREADS WISELY.', file=sys.stderr) os.environ['OMP_NUM_THREADS'] = str(num_threads) - sysstr = platform.system() + + flag_prefix = "FLAGS_" read_env_flags = [ - 'check_nan_inf', - 'convert_all_blocks', - 'benchmark', - 'eager_delete_scope', - 'fraction_of_cpu_memory_to_use', - 'initial_cpu_memory_in_mb', - 'init_allocated_mem', - 'paddle_num_threads', - 'dist_threadpool_size', - 'eager_delete_tensor_gb', - 'fast_eager_deletion_mode', - 'memory_fraction_of_eager_deletion', - 'allocator_strategy', - 'reader_queue_speed_test_mode', - 'print_sub_graph_dir', - 'pe_profile_fname', - 'inner_op_parallelism', - 'enable_parallel_graph', - 'fuse_parameter_groups_size', - 'multiple_of_cupti_buffer_size', - 'fuse_parameter_memory_size', - 'tracer_profile_fname', - 'dygraph_debug', - 'use_system_allocator', - 'enable_unused_var_check', - 'free_idle_chunk', - 'free_when_no_cache_hit', - 'call_stack_level', - 'sort_sum_gradient', - 'max_inplace_grad_add', - 'apply_pass_to_program', - 'new_executor_use_inplace', + key[len(flag_prefix):] for key in core.globals().keys() + if key.startswith(flag_prefix) ] - if 'Darwin' not in sysstr: - read_env_flags.append('use_pinned_memory') - if os.name != 'nt': - read_env_flags.append('cpu_deterministic') + def remove_flag_if_exists(name): + if name in read_env_flags: + read_env_flags.remove(name) - if core.is_compiled_with_mkldnn(): - read_env_flags.append('use_mkldnn') - read_env_flags.append('tracer_mkldnn_ops_on') - read_env_flags.append('tracer_mkldnn_ops_off') - - if core.is_compiled_with_cuda(): - read_env_flags += [ - 'fraction_of_gpu_memory_to_use', - 'initial_gpu_memory_in_mb', - 'reallocate_gpu_memory_in_mb', - 'cudnn_deterministic', - 'enable_cublas_tensor_op_math', - 'conv_workspace_size_limit', - 'cudnn_exhaustive_search', - 'selected_gpus', - 'sync_nccl_allreduce', - 'cudnn_batchnorm_spatial_persistent', - 'gpu_allocator_retry_time', - 'local_exe_sub_scope_limit', - 'gpu_memory_limit_mb', - 'conv2d_disable_cudnn', - 'get_host_by_name_time', - ] + sysstr = platform.system() + if 'Darwin' in sysstr: + remove_flag_if_exists('use_pinned_memory') - if core.is_compiled_with_npu(): - read_env_flags += [ - 'selected_npus', - 'fraction_of_gpu_memory_to_use', - 'initial_gpu_memory_in_mb', - 'reallocate_gpu_memory_in_mb', - 'gpu_memory_limit_mb', - 'npu_config_path', - 'get_host_by_name_time', - 'hccl_check_nan', - 'min_loss_scaling', - ] + if os.name == 'nt': + remove_flag_if_exists('cpu_deterministic') core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)]) # Note(zhouwei25): sys may not have argv in some cases, diff --git a/python/paddle/fluid/contrib/mixed_precision/bf16/decorator.py b/python/paddle/fluid/contrib/mixed_precision/bf16/decorator.py index 32c8a1c3544c2..4189abda0588f 100644 --- a/python/paddle/fluid/contrib/mixed_precision/bf16/decorator.py +++ b/python/paddle/fluid/contrib/mixed_precision/bf16/decorator.py @@ -42,6 +42,8 @@ class OptimizerWithMixedPrecision(object): def __init__(self, optimizer, amp_lists, use_pure_bf16, use_bf16_guard): self._optimizer = optimizer + if optimizer.type == 'sgd': + optimizer._use_mkldnn = True self._amp_lists = amp_lists self._param_grads = None self._train_program = None 
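The rewritten flag discovery above derives `read_env_flags` from every registered global whose name carries the `FLAGS_` prefix, instead of a hand-maintained list. A stand-alone sketch of the same list comprehension, using a stand-in dict in place of `paddle.fluid.core.globals()`:

    flag_prefix = "FLAGS_"
    # stand-in for core.globals(), which maps registered flag names to their values
    registered_globals = {"FLAGS_check_nan_inf": False, "FLAGS_use_pinned_memory": True, "GLOG_v": 0}
    read_env_flags = [
        key[len(flag_prefix):] for key in registered_globals.keys()
        if key.startswith(flag_prefix)
    ]
    print(read_env_flags)  # ['check_nan_inf', 'use_pinned_memory']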
diff --git a/python/paddle/fluid/contrib/optimizer.py b/python/paddle/fluid/contrib/optimizer.py index 7f742adb41e6f..3fb808a88a864 100644 --- a/python/paddle/fluid/contrib/optimizer.py +++ b/python/paddle/fluid/contrib/optimizer.py @@ -203,19 +203,21 @@ def _append_optimize_op(self, block, param_and_grad): param_and_grad[0]) lr = self._create_param_lr(param_and_grad) - if framework.in_dygraph_mode(): - _, _ = _C_ops.momentum( - param_and_grad[0], param_and_grad[1], velocity_acc, lr, - param_and_grad[0], velocity_acc, 'mu', self._momentum, - 'use_nesterov', self._use_nesterov, 'regularization_method', - self._regularization_method, 'regularization_coeff', - self._regularization_coeff) - return None - find_master = self._multi_precision and param_and_grad[ 0].dtype == core.VarDesc.VarType.FP16 master_weight = (self._master_weights[param_and_grad[0].name] if find_master else None) + + if framework.in_dygraph_mode(): + _, _, _ = _C_ops.momentum( + param_and_grad[0], param_and_grad[1], velocity_acc, lr, + master_weight, param_and_grad[0], velocity_acc, master_weight, + 'mu', self._momentum, 'use_nesterov', self._use_nesterov, + 'regularization_method', self._regularization_method, + 'regularization_coeff', self._regularization_coeff, + 'multi_precision', find_master) + return None + attrs = { "mu": self._momentum, "use_nesterov": self._use_nesterov, diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index 112623d23a65f..7930923668c7d 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -93,7 +93,8 @@ def apply(self, graph): graph = self._dequantize_weights(graph) graph = self._optimize_fp32_graph(graph) graph = self._compute_weight_scales(graph) - graph = self._update_relu_output_scales(graph) + # This function causes nondeterministic quantization behavior + # graph = self._update_relu_output_scales(graph) graph = self._propagate_scales(graph) graph = self._quantize_fp32_graph(graph) graph = self._final_optimizations(graph) diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index e55db665052ce..03503111fca9a 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -92,17 +92,14 @@ function(inference_quant2_int8_nlp_test target quant_model_dir fp32_model_dir da --ops_to_quantize ${ops_to_quantize}) endfunction() -function(inference_quant2_int8_lstm_model_test target fp32_model dataset_path) +function(inference_quant2_int8_lstm_model_test target fp32_model quant_model dataset_path) py_test(${target} SRCS "${CMAKE_CURRENT_SOURCE_DIR}/quant2_int8_lstm_model.py" - ENVS FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI} - OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI} - FLAGS_use_mkldnn=true ARGS --fp32_model ${fp32_model} + --quant_model ${quant_model} --infer_data ${dataset_path} - --num_threads 4 + --num_threads 1 --mkldnn_cache_capacity 100 --warmup_iter 100 - --warmup_batch_size 1 --acc_diff_threshold 0.11) endfunction() @@ -293,11 +290,10 @@ if(LINUX AND WITH_MKLDNN) # PTQ int8 lstm model set(LSTM_DATA_ARCHIVE "unittest_model_data/quant_lstm_input_data.tar.gz") - set(QUANT2_INT8_LSTM_SAVE_PATH "${QUANT_INSTALL_DIR}/lstm_quant2") download_quant_data(${QUANT2_INT8_LSTM_SAVE_PATH} ${LSTM_DATA_ARCHIVE} add84c754e9b792fea1fbd728d134ab7) 
set(QUANT2_FP32_LSTM_MODEL_ARCHIVE "lstm_fp32_model.tar.gz") download_lstm_model(${QUANT2_INT8_LSTM_SAVE_PATH} ${QUANT2_FP32_LSTM_MODEL_ARCHIVE} eecd9f44d69a84acc1cf2235c4b8b743) - inference_quant2_int8_lstm_model_test(test_quant2_int8_lstm_mkldnn ${QUANT2_INT8_LSTM_SAVE_PATH}/lstm_fp32_model ${QUANT2_INT8_LSTM_SAVE_PATH}/quant_lstm_input_data) + inference_quant2_int8_lstm_model_test(test_quant2_int8_lstm_mkldnn ${QUANT2_INT8_LSTM_SAVE_PATH}/lstm_fp32_model ${QUANT2_LSTM_MODEL_DIR}/lstm_quant ${QUANT2_INT8_LSTM_SAVE_PATH}/quant_lstm_input_data) endif() diff --git a/python/paddle/fluid/contrib/slim/tests/quant2_int8_lstm_model.py b/python/paddle/fluid/contrib/slim/tests/quant2_int8_lstm_model.py index 0e33bd8ba1a4e..4f4a2ddd4ab41 100644 --- a/python/paddle/fluid/contrib/slim/tests/quant2_int8_lstm_model.py +++ b/python/paddle/fluid/contrib/slim/tests/quant2_int8_lstm_model.py @@ -20,30 +20,28 @@ import unittest from paddle import fluid from paddle.fluid.core import AnalysisConfig, create_paddle_predictor +from save_quant_model import transform_and_save_int8_model def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( '--fp32_model', type=str, default='', help='A path to a FP32 model.') - parser.add_argument('--infer_data', type=str, default='', help='Data file.') parser.add_argument( - '--num_threads', type=int, default=1, help='Number of threads.') + '--quant_model', type=str, default='', help='A path to a quant model.') + parser.add_argument('--infer_data', type=str, default='', help='Data file.') parser.add_argument( '--warmup_iter', type=int, default=1, help='Number of the first iterations to skip in performance statistics.') - parser.add_argument( - '--warmup_batch_size', - type=int, - default=1, - help='Number of batches to use in PTQ warmup. 
Default: 1.') parser.add_argument( '--acc_diff_threshold', type=float, default=0.01, help='Accepted accuracy difference threshold.') + parser.add_argument( + '--num_threads', type=int, default=1, help='Number of threads.') parser.add_argument( '--mkldnn_cache_capacity', type=int, @@ -56,7 +54,7 @@ def parse_args(): class TestLstmModelPTQ(unittest.TestCase): - def get_warmup_tensor(self, data_path, place, warmup_batch_size): + def get_warmup_tensor(self, data_path, place): data = [] with open(data_path, 'rb') as in_f: while True: @@ -87,30 +85,31 @@ def get_warmup_tensor(self, data_path, place, warmup_batch_size): infer_label.shape = label.shape infer_label.dtype = fluid.core.PaddleDType.INT32 data.append([infer_data, infer_label]) - warmup_data = data[:warmup_batch_size] - inputs = data[warmup_batch_size:] + warmup_data = data[:1] + inputs = data[1:] return warmup_data, inputs def set_config(self, model_path, num_threads, mkldnn_cache_capacity, - warmup_batch_size, warmup_data=None, - enable_int8=False): + use_analysis=False, + enable_ptq=False): config = AnalysisConfig(model_path) - config.disable_gpu() - config.switch_use_feed_fetch_ops(True) - config.switch_ir_optim(True) config.set_cpu_math_library_num_threads(num_threads) - # This pass to work properly, must be added before fc_fuse_pass - config.pass_builder().insert_pass(5, "fc_lstm_fuse_pass") - config.enable_mkldnn() - config.set_mkldnn_cache_capacity(mkldnn_cache_capacity) - if enable_int8: - config.enable_quantizer() - config.quantizer_config().set_quant_data(warmup_data) - config.quantizer_config().set_quant_batch_size(warmup_batch_size) + if use_analysis: + config.disable_gpu() + config.switch_use_feed_fetch_ops(True) + config.switch_ir_optim(True) + config.enable_mkldnn() + config.set_mkldnn_cache_capacity(mkldnn_cache_capacity) + if enable_ptq: + # This pass to work properly, must be added before fc_fuse_pass + config.pass_builder().insert_pass(5, "fc_lstm_fuse_pass") + config.enable_quantizer() + config.quantizer_config().set_quant_data(warmup_data) + config.quantizer_config().set_quant_batch_size(1) return config def run_program(self, @@ -119,15 +118,13 @@ def run_program(self, num_threads, mkldnn_cache_capacity, warmup_iter, - warmup_batch_size, - enable_ptq_int8=False): + use_analysis=False, + enable_ptq=False): place = fluid.CPUPlace() - warmup_data, inputs = self.get_warmup_tensor(data_path, place, - warmup_batch_size) + warmup_data, inputs = self.get_warmup_tensor(data_path, place) warmup_data = [item[0] for item in warmup_data] config = self.set_config(model_path, num_threads, mkldnn_cache_capacity, - warmup_batch_size, warmup_data, - enable_ptq_int8) + warmup_data, use_analysis, enable_ptq) predictor = create_paddle_predictor(config) data = [item[0] for item in inputs] @@ -183,34 +180,47 @@ def test_lstm_model(self): fp32_model = test_case_args.fp32_model assert fp32_model, 'The FP32 model path cannot be empty. Please, use the --fp32_model option.' + quant_model = test_case_args.quant_model + assert quant_model, 'The quant model path cannot be empty. Please, use the --quant_model option.' infer_data = test_case_args.infer_data assert infer_data, 'The dataset path cannot be empty. Please, use the --infer_data option.' 
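For orientation, the three predictor configurations exercised by the reworked test map onto the new `(use_analysis, enable_ptq)` arguments as follows; `enable_ptq` only takes effect when `use_analysis` is true, because the quantizer is configured inside that branch of `set_config`:

    # (use_analysis, enable_ptq) pairs passed to run_program() in this test
    RUN_CONFIGS = {
        "fp32_baseline": (False, False),
        "ptq_int8":      (True,  True),   # MKL-DNN post-training quantization with warmup data
        "quant2_int8":   (True,  False),  # offline-transformed quant model, analysis passes only
    }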
num_threads = test_case_args.num_threads mkldnn_cache_capacity = test_case_args.mkldnn_cache_capacity warmup_iter = test_case_args.warmup_iter - warmup_batch_size = test_case_args.warmup_batch_size acc_diff_threshold = test_case_args.acc_diff_threshold (fp32_hx_acc, fp32_ctc_acc, fp32_fps) = self.run_program( fp32_model, infer_data, num_threads, mkldnn_cache_capacity, - warmup_iter, warmup_batch_size, False) + warmup_iter, False, False) (int8_hx_acc, int8_ctc_acc, int8_fps) = self.run_program( fp32_model, infer_data, num_threads, mkldnn_cache_capacity, - warmup_iter, warmup_batch_size, True) + warmup_iter, True, True) + + quant_model_save_path = quant_model + "_int8" + # transform model to quant2 + transform_and_save_int8_model(quant_model, quant_model_save_path, + "fusion_lstm,concat") - print("FP32: fps {0}, hx_acc {1}, ctc_acc {2}.".format( + (quant_hx_acc, quant_ctc_acc, quant_fps) = self.run_program( + quant_model_save_path, infer_data, num_threads, + mkldnn_cache_capacity, warmup_iter, True, False) + + print("FP32: fps {0}, hx_acc {1}, ctc_acc {2}".format( fp32_fps, fp32_hx_acc, fp32_ctc_acc)) - print("PTQ INT8: fps {0}, hx_acc {1}, ctc_acc {2}.".format( + print("PTQ_INT8: fps {0}, hx_acc {1}, ctc_acc {2}".format( int8_fps, int8_hx_acc, int8_ctc_acc)) + print("QUANT2_INT8: fps {0}, hx_acc {1}, ctc_acc {2}".format( + quant_fps, quant_hx_acc, quant_ctc_acc)) + sys.stdout.flush() - hx_delta_value = fp32_hx_acc - int8_hx_acc - ctc_delta_value = fp32_ctc_acc - int8_ctc_acc - self.assertLess(hx_delta_value, acc_diff_threshold) - self.assertLess(ctc_delta_value, acc_diff_threshold) + self.assertLess(fp32_hx_acc - int8_hx_acc, acc_diff_threshold) + self.assertLess(fp32_ctc_acc - int8_ctc_acc, acc_diff_threshold) + self.assertLess(fp32_hx_acc - quant_hx_acc, acc_diff_threshold) + self.assertLess(fp32_ctc_acc - quant_ctc_acc, acc_diff_threshold) if __name__ == "__main__": diff --git a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py index e38148250af21..3fadf25150f9e 100644 --- a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py +++ b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py @@ -16,11 +16,6 @@ import os import sys import argparse -import logging -import struct -import six -import numpy as np -import time import paddle import paddle.fluid as fluid from paddle.fluid.framework import IrGraph @@ -62,7 +57,11 @@ def parse_args(): return test_args, sys.argv[:1] + args -def transform_and_save_int8_model(original_path, save_path): +def transform_and_save_int8_model(original_path, + save_path, + ops_to_quantize='', + op_ids_to_skip='', + debug=False): place = fluid.CPUPlace() exe = fluid.Executor(place) inference_scope = fluid.executor.global_scope() @@ -75,24 +74,26 @@ def transform_and_save_int8_model(original_path, save_path): fetch_targets] = fluid.io.load_inference_model(original_path, exe, 'model', 'params') - ops_to_quantize = set() - if len(test_args.ops_to_quantize) > 0: - ops_to_quantize = set(test_args.ops_to_quantize.split(',')) + ops_to_quantize_set = set() + print(ops_to_quantize) + if len(ops_to_quantize) > 0: + ops_to_quantize_set = set(ops_to_quantize.split(',')) - op_ids_to_skip = set([-1]) - if len(test_args.op_ids_to_skip) > 0: - op_ids_to_skip = set(map(int, test_args.op_ids_to_skip.split(','))) + op_ids_to_skip_set = set([-1]) + print(op_ids_to_skip) + if len(op_ids_to_skip) > 0: + op_ids_to_skip_set = set(map(int, op_ids_to_skip.split(','))) graph = 
IrGraph(core.Graph(inference_program.desc), for_test=True) - if (test_args.debug): + if (debug): graph.draw('.', 'quant_orig', graph.all_op_nodes()) transform_to_mkldnn_int8_pass = Quant2Int8MkldnnPass( - ops_to_quantize, - _op_ids_to_skip=op_ids_to_skip, + ops_to_quantize_set, + _op_ids_to_skip=op_ids_to_skip_set, _scope=inference_scope, _place=place, _core=core, - _debug=test_args.debug) + _debug=debug) graph = transform_to_mkldnn_int8_pass.apply(graph) inference_program = graph.to_program() with fluid.scope_guard(inference_scope): @@ -106,5 +107,6 @@ def transform_and_save_int8_model(original_path, save_path): if __name__ == '__main__': global test_args test_args, remaining_args = parse_args() - transform_and_save_int8_model(test_args.quant_model_path, - test_args.int8_model_save_path) + transform_and_save_int8_model( + test_args.quant_model_path, test_args.int8_model_save_path, + test_args.ops_to_quantize, test_args.op_ids_to_skip, test_args.debug) diff --git a/python/paddle/fluid/data.py b/python/paddle/fluid/data.py index 05ea66f54451b..31906c465a074 100644 --- a/python/paddle/fluid/data.py +++ b/python/paddle/fluid/data.py @@ -73,8 +73,10 @@ def data(name, shape, dtype='float32', lod_level=0): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid import numpy as np + paddle.enable_static() # Creates a variable with fixed size [3, 2, 1] # User can only feed data of the same shape to x diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index cf9d40d7b00c0..438831208b66a 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -727,6 +727,7 @@ def load_into_memory(self, is_shuffle=False): Examples: .. code-block:: python + # required: skiptest import paddle.fluid as fluid dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") filelist = ["a.txt", "b.txt"] @@ -753,6 +754,7 @@ def preload_into_memory(self, thread_num=None): Examples: .. code-block:: python + # required: skiptest import paddle.fluid as fluid dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") filelist = ["a.txt", "b.txt"] @@ -777,6 +779,7 @@ def wait_preload_done(self): Examples: .. code-block:: python + # required: skiptest import paddle.fluid as fluid dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") filelist = ["a.txt", "b.txt"] @@ -797,6 +800,7 @@ def local_shuffle(self): Examples: .. code-block:: python + # required: skiptest import paddle.fluid as fluid dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") filelist = ["a.txt", "b.txt"] @@ -819,6 +823,7 @@ def global_shuffle(self, fleet=None, thread_num=12): Examples: .. code-block:: python + # required: skiptest import paddle.fluid as fluid from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") @@ -866,6 +871,7 @@ def release_memory(self): Examples: .. code-block:: python + # required: skiptest import paddle.fluid as fluid from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") @@ -925,6 +931,7 @@ def get_memory_data_size(self, fleet=None): Examples: .. code-block:: python + # required: skiptest import paddle.fluid as fluid from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") @@ -965,6 +972,7 @@ def get_shuffle_data_size(self, fleet=None): Examples: .. 
code-block:: python + # required: skiptest import paddle.fluid as fluid from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index 759ce3d16a46a..0d02a383c1bb8 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -19,8 +19,12 @@ from paddle.fluid.framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter, _dygraph_tracer, dygraph_only, set_flags, get_flags import warnings import copy +import functools +import paddle +import operator +import types -__all__ = ['amp_guard'] +__all__ = ['amp_guard', 'amp_decorate'] # The set of ops that support fp16 calculation and are considered numerically- # safe and performance-critical. These ops are always converted to fp16. @@ -64,15 +68,22 @@ 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, } +PURE_FP16_BLACK_LIST = {' '} +PURE_FP16_WHITE_LIST = {'lookup_table', 'lookup_table_v2'} + #NOTE(zhiqiu): similar as paddle.fluid.contrib.mixed_precision.fp16_lists.AutoMixedPrecisionLists._update_list # The reason why not use AutoMixedPrecisionLists is that custom_black_varnames is not suitable for imperative mode. -def _update_list(custom_white_list, custom_black_list): +def _update_list(custom_white_list, custom_black_list, level='O1'): """ Update black and white list according to users' custom list. """ - _white_list = copy.copy(WHITE_LIST) - _black_list = copy.copy(BLACK_LIST) + if level == 'O1': + _white_list = copy.copy(WHITE_LIST) + _black_list = copy.copy(BLACK_LIST) + else: + _white_list = copy.copy(PURE_FP16_WHITE_LIST) + _black_list = copy.copy(PURE_FP16_BLACK_LIST) if custom_white_list and custom_black_list: for op_name in custom_white_list: if op_name in custom_black_list: @@ -97,48 +108,136 @@ def _in_amp_guard(): """ tracer = _dygraph_tracer() if tracer: - return tracer._enable_autocast + if tracer._amp_level == 1: + return True + else: + return False else: return False +@dygraph_only +def pure_fp16_initialize(enable_pure_fp16, models, optimizers): + if not enable_pure_fp16: + return models, optimizers + + for idx in range(len(models)): + for layer in models[idx].sublayers(include_self=True): + layer._casted_by_pure_fp16 = True + if len(layer._sub_layers) is 0: + + if (layer._dtype is 'float16') or isinstance(layer, ( + paddle.nn.BatchNorm, paddle.nn.LayerNorm)): + continue + layer.to(dtype='float16') + + for idx_opt in range(len(optimizers)): + # update _param_groups + if getattr(optimizers[idx_opt], '_param_groups', None) and isinstance( + optimizers[idx_opt]._param_groups[0], dict): + for param_group in optimizers[idx_opt]._param_groups: + for i, param in enumerate(param_group['params']): + for idx_model in range(len(models)): + for layer in models[idx_model].sublayers( + include_self=True): + if id(param) in layer._parameters_transform_map: + param_group['params'][ + i] = layer._parameters_transform_map[id( + param)][0] + for param_group in optimizers[idx_opt]._parameter_list: + params = param_group['params'] + for i, param in enumerate(params): + for idx_model in range(len(models)): + for layer in models[idx_model].sublayers( + include_self=True): + if id(param) in layer._parameters_transform_map: + params[i] = layer._parameters_transform_map[id( + param)][0] + # update _parameter_list + else: + for i, param in enumerate(optimizers[idx_opt]._parameter_list): + for idx_model in 
range(len(models)): + for layer in models[idx_model].sublayers(include_self=True): + if id(param) in layer._parameters_transform_map: + optimizers[idx_opt]._parameter_list[ + i] = layer._parameters_transform_map[id(param)][ + 0] + if hasattr(optimizers[idx_opt], '_param_groups'): + optimizers[idx_opt]._param_groups[ + i] = layer._parameters_transform_map[id( + param)][0] + return models, optimizers + + +def check_models(models): + for model in models: + if not isinstance(model, paddle.nn.Layer): + raise RuntimeError( + "Current train mode is pure fp16, models should be paddle.nn.Layer, but receive {}.". + format(type(model))) + + +def check_optimizers(optimizers): + for optimizer in optimizers: + if not isinstance(optimizer, (paddle.optimizer.Optimizer, + paddle.fluid.optimizer.Optimizer)): + raise RuntimeError( + "Current train mode is pure fp16, optimizers should be paddle.optimizer.Optimizer or paddle.fluid.optimizer.Optimizer, but receive {}.". + format(type(optimizer))) + + @signature_safe_contextmanager @dygraph_only -def amp_guard(enable=True, custom_white_list=None, custom_black_list=None): +def amp_guard(enable=True, + custom_white_list=None, + custom_black_list=None, + level='O1'): """ :api_attr: imperative - Create a context which enables auto-mixed-precision(AMP) of operators executed in imperative mode. + Create a context which enables auto-mixed-precision(AMP) of operators executed in dynamic graph mode. If enabled, the input data type (float32 or float16) of each operator is decided by autocast algorithm for better performance. - Commonly, it is used together with `AmpScaler` to achieve Auto-Mixed-Precision in - imperative mode. + Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in + imperative mode. It is used together with `decorator` to achieve Pure fp16 in imperative mode. Args: enable(bool, optional): Enable auto-mixed-precision or not. Default is True. - custom_white_list(set|list, optional): The custom white_list. - custom_black_list(set|list, optional): The custom black_list. + custom_white_list(set|list|tuple, optional): The custom white_list. It's the set of ops that support + fp16 calculation and are considered numerically-safe and performance-critical. These ops + will be converted to fp16. + custom_black_list(set|list|tuple, optional): The custom black_list. The set of ops that support fp16 + calculation and are considered numerically-dangerous and whose effects may also be + observed in downstream ops. These ops will not be converted to fp16. + level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represent mixed precision, the input data type of each operator will be casted by white_list and black_list; + O2 represent Pure fp16, all operators parameters and input data will be casted to fp16, except operators in black_list, don't support fp16 kernel and batchnorm. Default is O1(amp) + Examples: .. 
code-block:: python import numpy as np - import paddle.fluid as fluid + import paddle data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') - with fluid.dygraph.guard(): - conv2d = fluid.dygraph.Conv2D(3, 2, 3) - data = fluid.dygraph.to_variable(data) - with fluid.dygraph.amp_guard(): + with paddle.fluid.dygraph.guard(): + conv2d = paddle.fluid.dygraph.Conv2D(3, 2, 3) + data = paddle.fluid.dygraph.to_variable(data) + with paddle.fluid.dygraph.amp_guard(): conv = conv2d(data) print(conv.dtype) # FP16 - with fluid.dygraph.amp_guard(enable=False): + with paddle.fluid.dygraph.amp_guard(enable=False): conv = conv2d(data) print(conv.dtype) # FP32 """ + if not (level in ['O1', 'O2']): + raise ValueError( + "level should be O1 or O2, O1 represent AMP train mode, O2 represent Pure fp16 train mode." + ) + tracer = _dygraph_tracer() if not tracer: raise ValueError( @@ -151,17 +250,27 @@ def amp_guard(enable=True, custom_white_list=None, custom_black_list=None): % tracer._expected_place) enable = False - # use default white_list and black_list if no custom lists provided - _white_list = WHITE_LIST - _black_list = BLACK_LIST + if level == 'O1': + amp_level = 1 + _white_list = WHITE_LIST + _black_list = BLACK_LIST + else: + amp_level = 2 + _white_list = PURE_FP16_WHITE_LIST + _black_list = PURE_FP16_BLACK_LIST + if custom_white_list or custom_black_list: _white_list, _black_list = _update_list(custom_white_list, - custom_black_list) + custom_black_list, level) + + if not enable: + amp_level = 0 if tracer: # enable auto_cast - original_enable = tracer._enable_autocast - tracer._enable_autocast = enable + original_amp_level = tracer._amp_level + tracer._amp_level = amp_level + # set amp op list original_white_list, original_black_list = tracer._get_amp_op_list() tracer._set_amp_op_list(_white_list, _black_list) @@ -179,6 +288,140 @@ def amp_guard(enable=True, custom_white_list=None, custom_black_list=None): yield finally: if tracer: - tracer._enable_autocast = original_enable + tracer._amp_level = original_amp_level tracer._set_amp_op_list(original_white_list, original_black_list) # set_flags(original_flags) + + +class StateDictHook(object): + def __init__(self, save_dtype): + self._save_dtype = save_dtype + + def __call__(self, state_dict): + for key in state_dict: + param = state_dict[key] + with paddle.fluid.dygraph.guard(): + param_applied = paddle.cast(param, self._save_dtype) + param_applied.name = param.name + state_dict[key] = param_applied + + +@dygraph_only +def amp_decorate(models, + optimizers=None, + level='O1', + master_weight=None, + save_dtype=None): + """ + Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), the decorate will do nothing. + When level is O2(pure fp16), the decorate will cast all parameters of models to FP16, except BatchNorm and LayerNorm. + + Commonly, it is used together with `amp_guard` to achieve Pure fp16 in imperative mode. + + Args: + models(Layer|list of Layer, optional): The defined models by user, models must be either a single model or a list of models. Default is None. + optimizers(Optimizer|list of Optimizer, optional): The defined optimizers by user, optimizers must be either a single optimizer or a list of optimizers. Default is None. + level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represent mixed precision, the decorator will do nothing; + O2 represent Pure fp16, the decorator will cast all parameters of models to FP16, except BatchNorm and LayerNorm. 
Default is O1(amp) + master_weight(bool, optinal): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, in O2 level optimizer will use multi-precision. Default is None. + save_dtype(float, optional): The save model parameter dtype when use `paddle.save` or `paddle.jit.save`,it should be float16, float32, float64 or None. + The save_dtype will not change model parameters dtype, it just change the state_dict dtype. When save_dtype is None, the save dtype is same as model dtype. Default is None. + + Examples: + + .. code-block:: python + + # required: gpu + # Demo1: single model and optimizer: + import paddle + + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=False) + optimzier = paddle.optimizer.SGD(parameters=model.parameters()) + + model, optimizer = paddle.fluid.dygraph.amp_decorate(models=model, optimizers=optimzier, level='O2') + + data = paddle.rand([10, 3, 32, 32]) + + with paddle.fluid.dygraph.amp_guard(enable=True, custom_white_list=None, custom_black_list=None, level='O2'): + output = model(data) + print(output.dtype) # FP16 + + # required: gpu + # Demo2: multi models and optimizers: + model2 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False) + optimizer2 = paddle.optimizer.Adam(parameters=model2.parameters()) + + models, optimizers = paddle.fluid.dygraph.amp_decorate(models=[model, model2], optimizers=[optimzier, optimizer2], level='O2') + + data = paddle.rand([10, 3, 32, 32]) + + with paddle.fluid.dygraph.amp_guard(enable=True, custom_white_list=None, custom_black_list=None, level='O2'): + output = models[0](data) + output2 = models[1](data) + print(output.dtype) # FP16 + print(output2.dtype) # FP16 + """ + if not (level in ['O1', 'O2']): + raise ValueError( + "level should be O1 or O2, O1 represent AMP train mode, O2 represent Pure fp16 train mode." + ) + + if level == 'O1': + return models, optimizers + + models_is_list = False + if isinstance(models, paddle.nn.Layer): + models_is_list = False + models = [models] + check_models(models) + elif isinstance(models, list): + check_models(models) + models_is_list = True + else: + raise TypeError( + "models must be either a single model or a list of models.") + + optimizers_is_list = False + if isinstance(optimizers, (paddle.optimizer.Optimizer, + paddle.fluid.optimizer.Optimizer)): + optimizers_is_list = False + optimizers = [optimizers] + check_optimizers(optimizers) + elif isinstance(optimizers, list): + check_optimizers(optimizers) + optimizers_is_list = True + else: + raise TypeError( + "optimizers must be either a single optimizer or a list of optimizers." + ) + + models, optimizers = pure_fp16_initialize( + enable_pure_fp16=True, models=models, optimizers=optimizers) + + # supprot master_weight + for idx_opt in range(len(optimizers)): + if hasattr(optimizers[idx_opt], '_multi_precision'): + if master_weight is False: + optimizers[idx_opt]._multi_precision = False + else: + optimizers[idx_opt]._multi_precision = True + + if save_dtype is not None: + if not (save_dtype in ['float16', 'float32', 'float64']): + raise ValueError( + "save_dtype can only be float16 float32 or float64, but your input save_dtype is %s." 
+ % save_dtype) + for idx in range(len(models)): + for layer in models[idx].sublayers(include_self=True): + layer.register_state_dict_hook(StateDictHook(save_dtype)) + + if models_is_list: + if optimizers_is_list: + return models, optimizers + else: + return models, optimizers[0] + else: + if optimizers_is_list: + return models[0], optimizers + else: + return models[0], optimizers[0] diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index a9fe2c9f3ed7b..432b178ea6706 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -21,8 +21,20 @@ import warnings import numpy as np from paddle import _C_ops +from collections import defaultdict +from enum import Enum -__all__ = ['AmpScaler'] +__all__ = ['AmpScaler', 'OptimizerState'] + + +class OptimizerState(Enum): + INIT = 0 + UNSCALED = 1 + STEPPED = 2 + + +def _refresh_optimizer_state(): + return {"state": OptimizerState.INIT} class AmpScaler(object): @@ -31,10 +43,11 @@ class AmpScaler(object): AmpScaler is used for Auto-Mixed-Precision training/inferring in imperative mode. It controls the scaling of loss, helps avoiding numerical overflow. - The object of this class has two methods `scale()`, `minimize()`. + The object of this class has seventeen methods `scale()`, `unscale_()`, `minimize()` and `get`/`set` api of parameters. `scale()` is used to multiply the loss by a scale ratio. - `minimize()` is similar as `Optimizer.minimize()`, performs parameters updating. + `unscale_()` is used to unscale the gradients of parameters, multiplies the gradients of parameters by 1/(scale ratio) + `minimize()` is similar as `optimizer.minimize()`, performs parameters updating, and it will update the loss_scaling. Commonly, it is used together with `amp_guard` to achieve Auto-Mixed-Precision in imperative mode. @@ -117,6 +130,7 @@ def __init__(self, self._scale = to_variable( np.array([self._init_loss_scaling]).astype(np.float32)) self._cache_founf_inf = None + self._optimizer_states = defaultdict(_refresh_optimizer_state) def scale(self, var): """ @@ -129,24 +143,25 @@ def scale(self, var): The scaled variable or original variable. Examples: + .. 
code-block:: python - import numpy as np - import paddle.fluid as fluid - - data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') - with fluid.dygraph.guard(): - model = fluid.dygraph.Conv2D(3, 2, 3) - optimizer = fluid.optimizer.SGDOptimizer( - learning_rate=0.01, parameter_list=model.parameters()) - scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) - data = fluid.dygraph.to_variable(data) - with fluid.dygraph.amp_guard(): - conv = model(data) - loss = fluid.layers.reduce_mean(conv) - scaled = scaler.scale(loss) - scaled.backward() - scaler.minimize(optimizer, scaled) + import numpy as np + import paddle.fluid as fluid + + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + model = fluid.dygraph.Conv2D(3, 2, 3) + optimizer = fluid.optimizer.SGDOptimizer( + learning_rate=0.01, parameter_list=model.parameters()) + scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) + data = fluid.dygraph.to_variable(data) + with fluid.dygraph.amp_guard(): + conv = model(data) + loss = fluid.layers.reduce_mean(conv) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) """ check_type(var, "var", core.VarBase, 'AmpScaler.scale()') @@ -160,7 +175,7 @@ def minimize(self, optimizer, *args, **kwargs): This function is similar as `Optimizer.minimize()`, which performs parameters updating. If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped. - Otherwise, it first unscales the scaled gradients of parameters, then updates the parameters. + Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters. Finally, the loss scaling ratio is updated. @@ -170,30 +185,34 @@ def minimize(self, optimizer, *args, **kwargs): kwargs: Keyword arguments, which will be forward to `Optimizer.minimize()`. Examples: + .. 
code-block:: python - import numpy as np - import paddle.fluid as fluid - - data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') - with fluid.dygraph.guard(): - model = fluid.dygraph.Conv2D(3, 2, 3) - optimizer = fluid.optimizer.SGDOptimizer( - learning_rate=0.01, parameter_list=model.parameters()) - scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) - data = fluid.dygraph.to_variable(data) - with fluid.dygraph.amp_guard(): - conv = model(data) - loss = fluid.layers.reduce_mean(conv) - scaled = scaler.scale(loss) - scaled.backward() - scaler.minimize(optimizer, scaled) + import numpy as np + import paddle.fluid as fluid + + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + model = fluid.dygraph.Conv2D(3, 2, 3) + optimizer = fluid.optimizer.SGDOptimizer( + learning_rate=0.01, parameter_list=model.parameters()) + scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) + data = fluid.dygraph.to_variable(data) + with fluid.dygraph.amp_guard(): + conv = model(data) + loss = fluid.layers.reduce_mean(conv) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) """ if not self._enable: return optimizer.minimize(*args, **kwargs) + optimizer_state = self._optimizer_states[id(optimizer)] + # unscale the grad - self._unscale(optimizer) + if optimizer_state["state"] is OptimizerState.INIT: + self._unscale(optimizer) optimize_ops, params_grads = (None, None) @@ -207,26 +226,75 @@ def minimize(self, optimizer, *args, **kwargs): # uopdate the scale self._update() + self._optimizer_states = defaultdict(_refresh_optimizer_state) + return optimize_ops, params_grads def _unscale(self, optimizer): + """ + Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio). + If this instance of :class:`GradScaler` is not enabled, output are returned unmodified. + Args: + optimizer(Optimizer): The optimizer used to update parameters. + Returns: + The unscaled parameters or original parameters. + """ if not self._enable: return + optimizer_state = self._optimizer_states[id(optimizer)] + + if optimizer_state["state"] is OptimizerState.UNSCALED: + raise RuntimeError( + "unscale_() has already been called on this optimizer since the last update()." 
+ ) + elif optimizer_state["state"] is OptimizerState.STEPPED: + raise RuntimeError("unscale_() is being called after step().") + if getattr(optimizer, '_param_groups', None) and isinstance( optimizer._param_groups[0], dict): param_grads = [] + param_grads_fp16 = [] + param_grads_fp32 = [] for group in optimizer._param_groups: for param in group['params']: if param._grad_ivar() is not None: param_grads.append(param._grad_ivar()) + if param._grad_ivar( + ).dtype == core.VarDesc.VarType.FP16: + param_grads_fp16.append(param._grad_ivar()) + else: + param_grads_fp32.append(param._grad_ivar()) else: param_grads = [ param._grad_ivar() for param in optimizer._parameter_list if param._grad_ivar() is not None ] - _C_ops.check_finite_and_unscale(param_grads, self._scale, param_grads, - self._found_inf) + param_grads_fp16 = [ + param._grad_ivar() for param in optimizer._parameter_list + if (param._grad_ivar() is not None + ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP16 + ) + ] + param_grads_fp32 = [ + param._grad_ivar() for param in optimizer._parameter_list + if (param._grad_ivar() is not None + ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32 + ) + ] + temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) + temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool)) + if len(param_grads_fp16): + _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, + param_grads_fp16, + temp_found_inf_fp16) + if len(param_grads_fp32): + _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, + param_grads_fp32, + temp_found_inf_fp32) + self._found_inf = temp_found_inf_fp16 or temp_found_inf_fp32 + + optimizer_state["state"] = OptimizerState.UNSCALED def _update(self): """ diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 10c3861e7746a..d41c373bf5093 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -779,10 +779,11 @@ def fun(inputs): dygraph_state_dict = None if isinstance(inner_layer, Layer): - dygraph_state_dict = inner_layer.state_dict() + dygraph_state_dict = inner_layer.to_static_state_dict() elif isinstance(attr_func, StaticFunction): if attr_func._class_instance: - dygraph_state_dict = attr_func._class_instance.state_dict() + dygraph_state_dict = attr_func._class_instance.to_static_state_dict( + ) if dygraph_state_dict: # NOTE(chenweihang): we maintain the mapping of variable name to @@ -790,15 +791,19 @@ def fun(inputs): # saved to inference program may not need by dygraph Layer, # we only record the state_dict variable's structured name state_names_dict = dict() + state_var_dict = dict() for structured_name, var in six.iteritems(dygraph_state_dict): state_names_dict[var.name] = structured_name + state_var_dict[var.name] = var # 3. 
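The `OptimizerState` bookkeeping added above exists so that gradients are unscaled at most once per step: callers may now invoke `unscale_()` themselves (for example before gradient clipping), `minimize()` only unscales when the state is still `INIT`, and a second `unscale_()` or one issued after stepping raises. A minimal, framework-free sketch of that guard (the `Scaler` class here is illustrative only, not Paddle's `AmpScaler`):

.. code-block:: python

    from collections import defaultdict
    from enum import Enum


    class OptimizerState(Enum):
        INIT = 0
        UNSCALED = 1
        STEPPED = 2


    class Scaler(object):
        def __init__(self):
            self._states = defaultdict(lambda: {"state": OptimizerState.INIT})

        def unscale_(self, optimizer):
            state = self._states[id(optimizer)]
            if state["state"] is OptimizerState.UNSCALED:
                raise RuntimeError(
                    "unscale_() has already been called since the last update().")
            elif state["state"] is OptimizerState.STEPPED:
                raise RuntimeError("unscale_() is being called after step().")
            # ... multiply each gradient by 1 / loss_scale here ...
            state["state"] = OptimizerState.UNSCALED

        def minimize(self, optimizer):
            # only unscale implicitly if the caller has not done it already
            if self._states[id(optimizer)]["state"] is OptimizerState.INIT:
                self.unscale_(optimizer)
            # ... skip or apply the update, refresh the scale, then reset states ...
            self._states = defaultdict(lambda: {"state": OptimizerState.INIT})

The real `_unscale` above additionally splits gradients into FP16 and FP32 groups so the two dtypes are not mixed in a single `check_finite_and_unscale` call, and combines the two found-inf flags afterwards.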
share parameters from Layer to scope & record var info for param_or_buffer in concrete_program.parameters: # share to scope param_or_buffer_tensor = scope.var( param_or_buffer.name).get_tensor() - src_tensor = param_or_buffer.value().get_tensor() + #src_tensor = param_or_buffer.value().get_tensor() + src_tensor = state_var_dict[param_or_buffer.name].value( + ).get_tensor() param_or_buffer_tensor._share_data_with(src_tensor) # record var info if param_or_buffer.name not in extra_var_info: diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index cb7666b353db7..30d5ee44171f3 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -121,6 +121,13 @@ def __init__(self, name_scope=None, dtype="float32"): self._forward_pre_hooks = collections.OrderedDict() self._forward_post_hooks = collections.OrderedDict() + self._parameters_transform_map = {} + self._buffers_transform_map = {} + + self._casted_by_pure_fp16 = False + + self._state_dict_hooks = collections.OrderedDict() + def train(self): """ Sets this Layer and all its sublayers to training mode. @@ -1259,6 +1266,87 @@ def __repr__(self): final_str += ')' return final_str + def register_state_dict_hook(self, hook): + hook_remove_helper = HookRemoveHelper(self._state_dict_hooks) + self._state_dict_hooks[hook_remove_helper._hook_id] = hook + return hook_remove_helper + + def _state_dict_impl(self, + destination=None, + include_sublayers=True, + structured_name_prefix="", + include_non_persistable_buffer=False): + """ + Get all parameters and persistable buffers of current layer and its sub-layers. And set them into a dict + + Parameters: + destination(dict, optional) : If provide, all the parameters and persistable buffers will be set to this dict . Default: None + include_sublayers(bool, optional) : If true, also include the parameters and persistable buffers from sublayers. Default: True + include_non_persistable_buffer(bool, optional): If true, include non persistable buffers of current layer and its sub-layers, it is used in pure fp16 and jit.save. Default: False + """ + + if destination is None: + destination = collections.OrderedDict() + for name, data in self._parameters.items(): + if data is not None: + destination[structured_name_prefix + name] = data + for name, buffer in self._buffers.items(): + if not include_non_persistable_buffer: + if buffer is not None and name not in self._non_persistable_buffer_names_set: + destination[structured_name_prefix + name] = buffer + else: + if buffer is not None: + destination[structured_name_prefix + name] = buffer + + if include_sublayers: + for layer_name, layer_item in self._sub_layers.items(): + if layer_item is not None: + destination_temp = destination.copy() + destination_temp.update( + layer_item._state_dict_impl( + destination_temp, include_sublayers, + structured_name_prefix + layer_name + ".", + include_non_persistable_buffer)) + destination = destination_temp + + for state_dict_hook in self._state_dict_hooks.values(): + hook_result = state_dict_hook(destination) + if hook_result is not None: + destination = hook_result + + return destination + + def to_static_state_dict(self, + destination=None, + include_sublayers=True, + structured_name_prefix=""): + ''' + Get all parameters and buffers of current layer and its sub-layers. And set them into a dict + + Parameters: + destination(dict, optional) : If provide, all the parameters and persistable buffers will be set to this dict . 
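`register_state_dict_hook` above lets callers post-process whatever `_state_dict_impl` collects; the AMP decorate code at the top of this section registers a `StateDictHook(save_dtype)` on every sublayer through it. A sketch of such a hook on a plain Layer, with this patch applied (`CastHook` is illustrative, not the framework's own helper; it relies on the behaviour shown above that a hook receives the destination dict and may return a replacement):

.. code-block:: python

    import paddle


    class CastHook(object):
        """Illustrative hook: cast float32 entries of the collected state dict."""

        def __init__(self, dtype='float16'):
            self._dtype = dtype

        def __call__(self, state_dict):
            for key, value in state_dict.items():
                if value.dtype == paddle.float32:
                    state_dict[key] = paddle.cast(value, self._dtype)
            return state_dict


    linear = paddle.nn.Linear(4, 4)
    hook_handle = linear.register_state_dict_hook(CastHook('float16'))
    print({name: t.dtype for name, t in linear.state_dict().items()})
    hook_handle.remove()  # the returned HookRemoveHelper detaches the hook again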
Default: None + include_sublayers(bool, optional) : If true, also include the parameters and persistable buffers from sublayers. Default: True + + Retruns: + dict: a dict contains all the parameters and persistable buffers. + + Examples: + .. code-block:: python + + import paddle + + emb = paddle.nn.Embedding(10, 10) + + state_dict = emb.to_static_state_dict() + paddle.save( state_dict, "paddle_dy.pdparams") + + ''' + return self._state_dict_impl( + destination=destination, + include_sublayers=include_sublayers, + structured_name_prefix=structured_name_prefix, + include_non_persistable_buffer=True) + def state_dict(self, destination=None, include_sublayers=True, @@ -1269,7 +1357,7 @@ def state_dict(self, Parameters: destination(dict, optional) : If provide, all the parameters and persistable buffers will be set to this dict . Default: None include_sublayers(bool, optional) : If true, also include the parameters and persistable buffers from sublayers. Default: True - + Retruns: dict: a dict contains all the parameters and persistable buffers. @@ -1284,26 +1372,11 @@ def state_dict(self, paddle.save( state_dict, "paddle_dy.pdparams") ''' - - if destination is None: - destination = collections.OrderedDict() - for name, data in self._parameters.items(): - if data is not None: - destination[structured_name_prefix + name] = data - for name, buffer in self._buffers.items(): - if buffer is not None and name not in self._non_persistable_buffer_names_set: - destination[structured_name_prefix + name] = buffer - - if include_sublayers: - for layer_name, layer_item in self._sub_layers.items(): - if layer_item is not None: - destination_temp = destination.copy() - destination_temp.update( - layer_item.state_dict( - destination_temp, include_sublayers, - structured_name_prefix + layer_name + ".")) - destination = destination_temp - return destination + return self._state_dict_impl( + destination=destination, + include_sublayers=include_sublayers, + structured_name_prefix=structured_name_prefix, + include_non_persistable_buffer=False) @framework.deprecate_stat_dict def set_state_dict(self, state_dict, use_structured_name=True): @@ -1404,8 +1477,11 @@ def _apply(self, func, device, dtype, blocking): ).stop_gradient self._parameters[key]._set_grad_ivar(grad_applied) + self._parameters_transform_map[id(param)] = [param_applied, key] + for key, buf in self._buffers.items(): self._buffers[key] = func(buf, device, dtype, blocking) + self._buffers_transform_map[id(buf)] = [self._buffers[key], key] def to(self, device=None, dtype=None, blocking=None): ''' @@ -1501,6 +1577,7 @@ def transform(t, device, dtype, blocking): return new_t self._apply(transform, device, dtype, blocking) + self._dtype = dtype # [aliases] Compatible with old method names set_dict = set_state_dict diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 8cd8bc39941c5..4c7537d8d5c8e 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -540,19 +540,24 @@ def _update_feed(self, feed): Returns: feed:(list|dict) updated feed. """ - global_block = self._main_program.global_block() if feed is None: feed = {} - elif isinstance(feed, dict): - for feed_name in list(feed.keys()): - if not global_block.has_var(feed_name): - feed.pop(feed_name) - warnings.warn( - "The variable %s is not found in program. It is not declared or is pruned." - % feed_name) - else: - raise TypeError("Only support feed with `dict`, but received {}". 
- format(type(feed).__name__)) + elif isinstance(feed, (list, tuple)): + assert len(feed) == 1, "Not compiled with data parallel" + feed = feed[0] + + if not isinstance(feed, dict): + raise TypeError( + "feed requires dict as its Parameter. But you passed in %s" % + (type(feed))) + + global_block = self._main_program.global_block() + for feed_name in list(feed.keys()): + if not global_block.has_var(feed_name): + feed.pop(feed_name) + warnings.warn( + "The variable %s is not found in program. It is not declared or is pruned." + % feed_name) return feed diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 92afe0fdaff4d..11e7e7c2f7c08 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -5021,6 +5021,22 @@ def _prune_with_input(self, feeded_var_names, targets): "All feeded_var_names of Program._prune_with_input() can only be " "str, but received %s." % type(var)) + # find out all variables that can be generated or updated with given feed + generatable_vars = set() + + for idx, op in enumerate(self.global_block().ops): + runnable_op = True + for name in op.input_arg_names: + if not self.global_block().has_var(name): + continue + if self.global_block().var(name).persistable: + continue + if name not in generatable_vars.union(feeded_var_names): + runnable_op = False + break + if runnable_op: + generatable_vars = generatable_vars.union(op.output_arg_names) + targets_idx = [] for t in targets: if not isinstance(t, Operator): @@ -5038,7 +5054,9 @@ def _prune_with_input(self, feeded_var_names, targets): # (2) the variable is not leaf, and we need to prune the op that generates it. # In both cases, wo can just skip target_op of that it. if name in feeded_var_names: - continue + # however if the var is also updated by a runnable op, will shall keep it + if name not in generatable_vars: + continue # After transpiler processing, the op that output this # variable maybe has been changed, so t.op is not reliable @@ -5055,7 +5073,7 @@ def _prune_with_input(self, feeded_var_names, targets): continue else: target_op = op - break + if target_op is None: raise ValueError( "The target variable used for pruning should have an " diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py index 89b2a8237dc65..2874949e3c9b8 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -111,9 +111,104 @@ def _get_pull_sparse_ops(_program): return pull_sparse_ops def _pull_sparse_fuse(_program, pull_sparse_ops, use_ps_gpu): + def dag_check_up_and_reorder(program, inputs, outputs): + global_block = program.global_block() + min_output_index = len(global_block.ops) + max_input_index = -1 + input_indexes = [0] * len(global_block.ops) + output_indexes = [0] * len(global_block.ops) + for idx, op in enumerate(global_block.ops): + for i in range(0, len(op.output_names)): + if input_indexes[idx] == 1: + break + outs = op.output(op.output_names[i]) + for in_id, in_var in enumerate(inputs): + if in_var.name in outs: + input_indexes[idx] = 1 + max_input_index = max(max_input_index, idx) + break + + for i in range(0, len(op.input_names)): + if output_indexes[idx] == 1: + break + ins = op.input(op.input_names[i]) + for out_id, out_var in enumerate(outputs): + if out_var.name in ins: + output_indexes[idx] = 1 + min_output_index = min(min_output_index, idx) + + for i in 
range(len(global_block.ops)): + if input_indexes[i] == 1 and output_indexes[i] == 1: + warnings.warn( + "unable to re-arrange dags order to combine distributed embedding ops because a op both needs embedding table's output as input and produces ids as the same embedding table's input" + ) + return + + if min_output_index < max_input_index: + move_ops = [] + for i in range(min_output_index + 1, len(input_indexes)): + if input_indexes[i] == 1: + move_ops.append((global_block.ops[i], i)) + for i, op in enumerate(move_ops): + queue = list() + visited = set() + queue.append(op[1]) + visited.add(op[0]) + start = 0 + while start < len(queue): + pos = queue[start] + op = global_block.ops[pos] + op_inputs = [] + for k in range(0, len(op.input_names)): + ins = op.input(op.input_names[k]) + op_inputs.append(ins) + for j in range(pos - 1, min_output_index - 1, -1): + op1 = global_block.ops[j] + if op1 in visited: + continue + found = False + for k in range(0, len(op1.output_names)): + outs = op1.output(op1.output_names[k]) + for t in range(len(op_inputs)): + for y in op_inputs[t]: + if y in outs: + found = True + break + if found: + break + if found: + break + if found: + if output_indexes[j] == True: + warnings.warn( + "unable to re-arrange dags order to combine distributed embedding ops" + ) + return + queue.append(j) + visited.add(global_block.ops[j]) + start = start + 1 + + queue.sort() + for index in queue: + desc = global_block.desc._insert_op(min_output_index) + desc.copy_from(global_block.ops[index].desc) + global_block.desc._remove_op(index + 1, index + 2) + global_block.ops[index].desc = desc + insert_op = global_block.ops.pop(index) + input_state = input_indexes.pop(index) + output_state = output_indexes.pop(index) + global_block.ops.insert(min_output_index, insert_op) + input_indexes.insert(min_output_index, input_state) + output_indexes.insert(min_output_index, output_state) + min_output_index = min_output_index + 1 + + assert global_block.desc.op_size() == len(global_block.ops) + for i in range(len(global_block.ops)): + assert global_block.desc.op(i) == global_block.ops[i].desc + for param, ops in pull_sparse_ops.items(): all_ops = program.global_block().ops - op_idxs = [all_ops.index(op) for op in ops] + inputs = [ program.global_block().vars[op.input("Ids")[0]] for op in ops ] @@ -139,23 +234,28 @@ def _pull_sparse_fuse(_program, pull_sparse_ops, use_ps_gpu): program.global_block().vars[op.output("Out")[0]] for op in ops ] + dag_check_up_and_reorder(program, inputs, outputs) + + op_idxs = [all_ops.index(op) for op in ops] + for idx in op_idxs[::-1]: program.global_block()._remove_op(idx) inputs_idxs = [-1] * len(inputs) - outputs_idxs = [-1] * len(outputs) + outputs_idxs = [len(program.global_block().ops) + 1] * len(outputs) for idx, op in enumerate(program.global_block().ops): for i in range(0, len(op.output_names)): outs = op.output(op.output_names[i]) for in_id, in_var in enumerate(inputs): if in_var.name in outs: - inputs_idxs[in_id] = idx + inputs_idxs[in_id] = max(idx, inputs_idxs[in_id]) for i in range(0, len(op.input_names)): ins = op.input(op.input_names[i]) for out_id, out_var in enumerate(outputs): if out_var.name in ins: - outputs_idxs[out_id] = idx + outputs_idxs[out_id] = min(idx, + outputs_idxs[out_id]) if min(outputs_idxs) - max(inputs_idxs) >= 1: distributed_idx = max(inputs_idxs) + 1 @@ -187,7 +287,7 @@ def _pull_sparse_fuse(_program, pull_sparse_ops, use_ps_gpu): }) else: for i in range(len(inputs_idxs)): - distributed_idx = op_idxs[i] + 1 + distributed_idx = 
op_idxs[i] program.global_block()._insert_op( index=distributed_idx, @@ -557,7 +657,6 @@ def _append_heter_op(op, current_heter_block_ops, heter_ops): def create_heter_program(program, config, heter_program, heter_ops, block_var_detail, current_device): - # This function mainly includes the following contents: # 1. For every heter block: # a) copy heter device op from origin program @@ -1029,7 +1128,6 @@ def insert_send_concat_op(program, block, index, var_name_list, new_var_name, def insert_recv_slice_op(program, block, index, var_name, var_shape, dtype, type, new_var_name_list, new_var_shape_list): - if var_name not in program.global_block().vars: input_var = program.global_block().create_var( name=var_name, shape=var_shape, dtype=dtype, type=type) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index 39cf3ebeb32a9..e8d9cc3b77b6a 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -327,6 +327,21 @@ def print_table_stat(self, table_id): self._fleet_ptr.print_table_stat(table_id) self._role_maker._barrier_worker() + def set_file_num_one_shard(self, table_id, file_num): + """ + set file_num in one shard + Args: + table_id(int): the id of table + file_num(int): file num in one shard + Example: + .. code-block:: python + fleet.set_file_num_one_shard(0, 5) + """ + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.set_file_num_one_shard(table_id, file_num) + self._role_maker._barrier_worker() + def save_persistables(self, executor, dirname, main_program=None, **kwargs): """ save presistable parameters, diff --git a/python/paddle/fluid/input.py b/python/paddle/fluid/input.py index d7a8e3bcb8252..a83a230132300 100644 --- a/python/paddle/fluid/input.py +++ b/python/paddle/fluid/input.py @@ -98,7 +98,10 @@ def one_hot(input, depth, allow_out_of_range=False): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() + # Correspond to the first example above, where label.shape is 4 and one_hot_label.shape is [4, 4]. label = fluid.data(name="label", shape=[4], dtype="int64") one_hot_label = fluid.one_hot(input=label, depth=4) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 417e5ace8c191..f050b3995be96 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -1042,7 +1042,7 @@ def load_params(executor, dirname, main_program=None, filename=None): def load_persistables(executor, dirname, main_program=None, filename=None): """ :api_attr: Static Graph - + This API filters out all variables with ``persistable==True`` from the given ``main_program`` and then tries to load these variables from the directory ``dirname`` or the file ``filename``. @@ -1373,15 +1373,9 @@ def save_inference_model(dirname, ) break - # fix the bug that the activation op's output as target will be pruned. - # will affect the inference performance. - # TODO(Superjomn) add an IR pass to remove 1-scale op. 
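Two related inference-export fixes meet here. `Program._prune_with_input` (earlier in this diff) now pre-computes which non-persistable variables can be produced by ops that are runnable from the feeds alone, and keeps a fed variable as a pruning target when a runnable op also regenerates it; in `save_inference_model`, the `scale`-op wrapper that used to be inserted around every target (the TODO just above) is removed, and any target variable pruned away by `_inference_optimize` is recreated in the global block. A rough sketch of the reachability pass, assuming each op is given as a pair of input and output name lists and skipping the variable-existence checks:

.. code-block:: python

    def generatable_vars(ops, feeds, persistable):
        """Names that runnable ops can produce, starting only from feeds/persistables."""
        generatable = set()
        for input_names, output_names in ops:
            runnable = all(
                name in persistable or name in feeds or name in generatable
                for name in input_names)
            if runnable:
                generatable |= set(output_names)
        return generatable

    # 'x' is fed, but a runnable op also rewrites it, so pruning must keep that op.
    ops = [(['x'], ['h']), (['h', 'w'], ['x'])]
    print(generatable_vars(ops, feeds={'x'}, persistable={'w'}))  # set with 'h' and 'x'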
with program_guard(main_program): uniq_target_vars = [] for i, var in enumerate(target_vars): - if isinstance(var, Variable) and var.dtype != paddle.bool: - var = layers.scale( - var, 1., name="save_infer_model/scale_{}".format(i)) uniq_target_vars.append(var) target_vars = uniq_target_vars target_var_name_list = [var.name for var in target_vars] @@ -1427,6 +1421,13 @@ def save_inference_model(dirname, main_program = main_program._inference_optimize(prune_read_op=True) fetch_var_names = [v.name for v in target_vars] + for target_v in target_vars: + if not main_program.global_block().has_var(target_v.name): + main_program.global_block().create_var( + name=target_v.name, + shape=target_v.shape, + dtype=target_v.dtype) + prepend_feed_ops(main_program, feeded_var_names) append_fetch_ops(main_program, fetch_var_names) diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index 0954fe7f548d3..eaac99fc5b592 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -16,6 +16,7 @@ import numpy as np from functools import partial, reduce +import paddle from paddle.utils import deprecated from . import nn from .layer_function_generator import templatedoc @@ -1717,7 +1718,7 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002): batch_size = labels.shape[0] labels = nn.reshape(labels, shape=[batch_size, 1]) - labels = nn.expand(labels, expand_times=[1, batch_size]) + labels = paddle.tile(labels, repeat_times=[1, batch_size]) labels = equal(labels, nn.transpose(labels, perm=[1, 0])).astype('float32') labels = labels / nn.reduce_sum(labels, dim=1, keep_dim=True) @@ -1726,7 +1727,7 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002): + nn.reduce_mean(nn.reduce_sum(square(positive), 1)) l2loss = l2loss * Beta * l2_reg - similarity_matrix = nn.matmul( + similarity_matrix = paddle.matmul( anchor, positive, transpose_x=False, transpose_y=True) softmax_ce = softmax_with_cross_entropy( logits=similarity_matrix, label=labels, soft_label=True) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 13a1c8e079d42..515d4a5c0ef7c 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4521,7 +4521,10 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() + # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] @@ -5061,7 +5064,7 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): X = paddle.randn(shape=[3, 5], dtype='float64') out = paddle.fluid.layers.l2_normalize(X, axis=-1) - print(out.numpy()) + print(out) # [[ 0.21558504 0.56360189 0.47466096 0.46269539 -0.44326736] # [-0.70602414 -0.52745777 0.37771788 -0.2804768 -0.04449922] @@ -5160,7 +5163,10 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): # x: [M], y: [N] # fluid.layers.matmul(x, y, True, True) # out: [M, N] + import paddle import paddle.fluid as fluid + paddle.enable_static() + x = fluid.layers.data(name='x', shape=[2, 3], dtype='float32') y = fluid.layers.data(name='y', shape=[3, 2], dtype='float32') out = fluid.layers.matmul(x, y, True, True) @@ -5499,12 +5505,12 @@ def transpose(x, perm, name=None): perm[i]-th dimension of `input`. Args: - x (Tensor): The input Tensor. It is a N-D Tensor of data types float32, float64, int32. + x (Tensor): The input Tensor. It is a N-D Tensor of data types bool, float32, float64, int32. 
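The `npair_loss` change above swaps the deprecated `fluid.layers.expand`/`nn.matmul` calls for `paddle.tile` and `paddle.matmul`, which compute the same result for this 2-D case. A quick dygraph check of the replacement calls (shapes only, not the full loss):

.. code-block:: python

    import paddle

    labels = paddle.to_tensor([[0.], [1.], [2.]])        # shape [3, 1]
    tiled = paddle.tile(labels, repeat_times=[1, 3])     # [3, 3], same as the old expand
    similarity = paddle.matmul(tiled, tiled, transpose_y=True)
    print(tiled.shape, similarity.shape)                 # [3, 3] [3, 3]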
perm (list|tuple): Permute the input according to the data of perm. name (str): The name of this layer. It is optional. Returns: - Tensor: A transposed n-D Tensor, with data type being float32, float64, int32, int64. + Tensor: A transposed n-D Tensor, with data type being bool, float32, float64, int32, int64. For Example: @@ -5546,7 +5552,7 @@ def transpose(x, perm, name=None): return out check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], + x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], 'transpose') check_type(perm, 'perm', (list, tuple), 'transpose') if isinstance(perm, tuple): @@ -5999,7 +6005,10 @@ def one_hot(input, depth, allow_out_of_range=False): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() + # Correspond to the first example above, where label.shape is [4, 1] and one_hot_label.shape is [4, 4]. label = fluid.data(name="label", shape=[4, 1], dtype="int64") one_hot_label = fluid.layers.one_hot(input=label, depth=4) @@ -6727,8 +6736,10 @@ def pad(x, paddings, pad_value=0., name=None): x = fluid.data(name='data', shape=[300, 300], dtype='float32') out = fluid.layers.pad(x=x, paddings=[0, 1, 1, 2], pad_value=0.) """ - check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], "pad") + check_variable_and_dtype(x, 'x', [ + 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', + 'complex128' + ], "pad") helper = LayerHelper('pad', **locals()) dtype = helper.input_dtype(input_param_name='x') @@ -8361,7 +8372,10 @@ def gather(input, index, overwrite=True): .. code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() + x = fluid.data(name='x', shape=[-1, 5], dtype='float32') index = fluid.data(name='index', shape=[-1, 1], dtype='int32') output = fluid.layers.gather(x, index) @@ -8451,7 +8465,10 @@ def gather_nd(input, index, name=None): .. code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() + x = fluid.data(name='x', shape=[3, 4, 5], dtype='float32') index = fluid.data(name='index', shape=[2, 2], dtype='int32') output = fluid.layers.gather_nd(x, index) @@ -8486,6 +8503,7 @@ def scatter(input, index, updates, name=None, overwrite=True): Output is obtained by updating the input on selected indices based on updates. .. code-block:: python + import numpy as np #input: @@ -8527,8 +8545,10 @@ def scatter(input, index, updates, name=None, overwrite=True): .. code-block:: python + import paddle import numpy as np import paddle.fluid as fluid + paddle.enable_static() input = fluid.layers.data(name='data', shape=[3, 2], dtype='float32', append_batch_size=False) index = fluid.layers.data(name='index', shape=[4], dtype='int64', append_batch_size=False) @@ -8869,8 +8889,10 @@ def selu(x, scale=None, alpha=None, name=None): .. code-block:: python + import paddle import paddle.fluid as fluid import numpy as np + paddle.enable_static() inputs = fluid.layers.data(name="x", shape=[2, 2], dtype="float32") output = fluid.layers.selu(inputs) @@ -10478,22 +10500,24 @@ def expand_as(x, target_tensor, name=None): Examples: .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np + import paddle + import paddle.fluid as fluid + import numpy as np + paddle.enable_static() - data = fluid.layers.data(name="data", shape=[-1,10], dtype='float64') - target_tensor = fluid.layers.data( - name="target_tensor", shape=[-1,20], dtype='float64') - result = fluid.layers.expand_as(x=data, target_tensor=target_tensor) - use_cuda = False - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - x = np.random.rand(3,10) - y = np.random.rand(3,20) - output= exe.run(feed={"data":x,"target_tensor":y},fetch_list=[result.name]) - print(output[0].shape) - #(3,20) + data = fluid.layers.data(name="data", shape=[-1,10], dtype='float64') + target_tensor = fluid.layers.data( + name="target_tensor", shape=[-1,20], dtype='float64') + result = fluid.layers.expand_as(x=data, target_tensor=target_tensor) + use_cuda = False + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + x = np.random.rand(3,10) + y = np.random.rand(3,20) + output= exe.run(feed={"data":x,"target_tensor":y},fetch_list=[result.name]) + print(output[0].shape) + #(3,20) """ if in_dygraph_mode(): @@ -10574,7 +10598,9 @@ def uniform_random_batch_size_like(input, Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() # example 1: input = fluid.data(name="input", shape=[1, 3], dtype='float32') @@ -10647,7 +10673,9 @@ def gaussian_random(shape, Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() # example 1: # attr shape is a list which doesn't contain Tensor. @@ -10675,7 +10703,8 @@ def gaussian_random(shape, .. code-block:: python - # declarative mode + # declarative mode + # required: skiptest import numpy as np from paddle import fluid @@ -10814,7 +10843,10 @@ def gaussian_random_batch_size_like(input, Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() + input = fluid.data(name="input", shape=[13, 11], dtype='float32') out = fluid.layers.gaussian_random_batch_size_like( @@ -11420,7 +11452,9 @@ def size(input): Examples: .. code-block:: python + import paddle import paddle.fluid.layers as layers + paddle.enable_static() input = layers.data( name="input", shape=[3, 100], dtype="float32", append_batch_size=False) @@ -12523,7 +12557,10 @@ def mean(x, name=None): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() + input = fluid.layers.data( name='data', shape=[2, 3], dtype='float32') mean = fluid.layers.mean(input) @@ -15193,7 +15230,9 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0, Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() # example 1: # attr shape is a list which doesn't contain Tensor. diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index ec20712cac760..c3d8fbfa55307 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -2528,18 +2528,21 @@ def lstm(input, Examples: .. 
code-block:: python + import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers + paddle.enable_static() emb_dim = 256 vocab_size = 10000 data = fluid.data(name='x', shape=[None, 100], dtype='int64') emb = fluid.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True) - batch_size = 20 + batch_size = 100 dropout_prob = 0.2 input_size = 100 hidden_size = 150 num_layers = 1 + max_len = 12 init_h = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0 ) init_c = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0 ) rnn_out, last_h, last_c = layers.lstm( emb, init_h, init_c, \ diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 3a0bbeb0fa341..abc84be12b38f 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -322,6 +322,8 @@ def concat(input, axis=0, name=None): if isinstance(axis, Variable): axis = axis.numpy() axis = axis.item(0) + if not isinstance(input, Variable): + input = [t for t in input if t.shape.count(0) == 0] return _C_ops.concat(input, 'axis', axis) check_type(input, 'input', (list, tuple, Variable), 'concat') diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 709b36ed8e32b..24076e82b0365 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -1305,6 +1305,7 @@ def __init__(self, grad_clip=grad_clip, name=name) self.type = "sgd" + self._use_mkldnn = False @no_grad def _append_optimize_op(self, block, param_and_grad): @@ -1323,6 +1324,7 @@ def _append_optimize_op(self, block, param_and_grad): "Grad": param_and_grad[1], "LearningRate": lr }, + attrs={"use_mkldnn": self._use_mkldnn}, outputs={"ParamOut": param_and_grad[0]}, stop_gradient=True) @@ -1433,12 +1435,12 @@ def _append_optimize_op(self, block, param_and_grad): velocity_acc = self._get_accumulator(self._velocity_acc_str, param_and_grad[0]) lr = self._create_param_lr(param_and_grad) - + master_weight = None if framework.in_dygraph_mode(): - _, _ = _C_ops.momentum(param_and_grad[0], param_and_grad[1], - velocity_acc, lr, param_and_grad[0], - velocity_acc, 'mu', self._momentum, - 'use_nesterov', self._use_nesterov) + _, _, _ = _C_ops.momentum( + param_and_grad[0], param_and_grad[1], velocity_acc, lr, + master_weight, param_and_grad[0], velocity_acc, master_weight, + 'mu', self._momentum, 'use_nesterov', self._use_nesterov) return None attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov} @@ -1982,26 +1984,29 @@ def __init__(self, self._master_weights = {} def _create_master_weight(self, param): - assert isinstance(self.helper, LayerHelper) + if param.name in self._master_weights: + var = self._master_weights[param.name] + else: + assert isinstance(self.helper, LayerHelper) - var_name = param.name + '_fp32_master' - var_name = unique_name.generate(var_name) - var = layers.create_global_var( - name=var_name, - shape=param.shape, - value=0, - dtype='float32', - persistable=True) - block = self.helper.startup_program.global_block() - block.append_op( - type="cast", - inputs={"X": [param]}, - outputs={"Out": [var]}, - attrs={ - "in_dtype": param.dtype, - "out_dtype": core.VarDesc.VarType.FP32 - }) - self._master_weights[param.name] = var + var_name = param.name + '_fp32_master' + var_name = unique_name.generate(var_name) + var = layers.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True) + block = 
self.helper.startup_program.global_block() + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32 + }) + self._master_weights[param.name] = var return var def _get_accumulator(self, name, param): @@ -2462,12 +2467,14 @@ def _append_optimize_op(self, block, param_and_grad): self._beta1, Variable) else self._beta1.numpy().item(0) _beta2 = self._beta2 if not isinstance( self._beta2, Variable) else self._beta2.numpy().item(0) - _, _, _, _, _ = _C_ops.adam( + master_weight = None + _, _, _, _, _, _ = _C_ops.adam( param_and_grad[0], param_and_grad[1], lr, moment1, moment2, - beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1, - moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon, - 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread', - 1000, 'beta1', _beta1, 'beta2', _beta2, 'use_global_beta_pow', + beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0], + moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, + 'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode, + 'min_row_size_to_use_multithread', 1000, 'beta1', _beta1, + 'beta2', _beta2, 'use_global_beta_pow', self._use_global_beta_pow) return None @@ -3959,62 +3966,59 @@ class ExponentialMovingAverage(object): Args: - decay (float, optional): The exponential decay rate, usually close to 1, such as - 0.999, 0.9999, ... . Default 0.999. - thres_steps (Variable|None): If not `None`, schedule the decay rate. - Default None. - name (str|None): For detailed information, please refer to - :ref:`api_guide_Name`. Usually name is no need to set and None by - default. + decay (float, optional): The exponential decay rate, usually close to 1, such as 0.999, 0.9999, ... . Default 0.999. + thres_steps (Variable|None, optional): If not `None`, schedule the decay rate. Default None. + name (str|None, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Examples: - .. code-block:: python - - import numpy - import paddle - import paddle.fluid as fluid - - data = fluid.data(name='x', shape=[-1, 5], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - cost = fluid.layers.mean(hidden) - - test_program = fluid.default_main_program().clone(for_test=True) - - optimizer = fluid.optimizer.Adam(learning_rate=0.001) - optimizer.minimize(cost) - - global_steps = fluid.layers.autoincreased_step_counter() - ema = fluid.optimizer.ExponentialMovingAverage(0.999, thres_steps=global_steps) - ema.update() - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - for pass_id in range(3): - for batch_id in range(6): - data = numpy.random.random(size=(10, 5)).astype('float32') - exe.run(program=fluid.default_main_program(), - feed={'x': data}, - fetch_list=[cost.name]) - - # usage 1 - with ema.apply(exe): - data = numpy.random.random(size=(10, 5)).astype('float32') - exe.run(program=test_program, - feed={'x': data}, - fetch_list=[hidden.name]) - - - # usage 2 - with ema.apply(exe, need_restore=False): - data = numpy.random.random(size=(10, 5)).astype('float32') - exe.run(program=test_program, - feed={'x': data}, - fetch_list=[hidden.name]) - ema.restore(exe) + .. 
code-block:: python + + import numpy + import paddle + import paddle.static as static + from paddle.static import ExponentialMovingAverage + + paddle.enable_static() + + data = static.data(name='x', shape=[-1, 5], dtype='float32') + hidden = static.nn.fc(x=data, size=10) + cost = paddle.mean(hidden) + + test_program = static.default_main_program().clone(for_test=True) + optimizer = paddle.optimizer.Adam(learning_rate=0.001) + optimizer.minimize(cost) + + ema = ExponentialMovingAverage(0.999) + ema.update() + + place = paddle.CPUPlace() + exe = static.Executor(place) + exe.run(static.default_startup_program()) + + for pass_id in range(3): + for batch_id in range(6): + data = numpy.random.random(size=(10, 5)).astype('float32') + exe.run(program=static.default_main_program(), + feed={'x': data}, + fetch_list=[cost.name]) + + # usage 1 + with ema.apply(exe): + data = numpy.random.random(size=(10, 5)).astype('float32') + exe.run(program=test_program, + feed={'x': data}, + fetch_list=[hidden.name]) + + # usage 2 + with ema.apply(exe, need_restore=False): + data = numpy.random.random(size=(10, 5)).astype('float32') + exe.run(program=test_program, + feed={'x': data}, + fetch_list=[hidden.name]) + ema.restore(exe) + """ def __init__(self, decay=0.999, thres_steps=None, name=None): @@ -4379,6 +4383,18 @@ def _create_vars(self, block, ori_block): name=var, type=core.VarDesc.VarType.READER, persistable=source_var.persistable) + elif isinstance(source_var, Parameter): + dest_var = block.create_parameter( + name=source_var.name, + shape=source_var.shape, + dtype=source_var.dtype, + type=source_var.type, + lod_level=source_var.lod_level, + stop_gradient=source_var.stop_gradient, + trainable=source_var.trainable, + optimize_attr=source_var.optimize_attr, + regularizer=source_var.regularizer, + error_clip=source_var.error_clip) else: dest_var = block._clone_variable(source_var, False) self._clone_var_attr(dest_var, source_var) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index bd0c666968806..4b887da838257 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -125,6 +125,9 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) LIST(REMOVE_ITEM TEST_OPS test_disable_signal_handler) endif() +# Temporally disable test_deprecated_decorator +LIST(REMOVE_ITEM TEST_OPS test_deprecated_decorator) + if(WIN32) LIST(REMOVE_ITEM TEST_OPS test_multiprocess_reader_exception) LIST(REMOVE_ITEM TEST_OPS test_trainer_desc) @@ -700,6 +703,10 @@ add_subdirectory(sequence) add_subdirectory(dygraph_to_static) add_subdirectory(rnn) +if (NOT WIN32 OR NOT WITH_GPU) + add_subdirectory(fft) +endif() + if (WITH_XPU) add_subdirectory(xpu) endif() @@ -960,6 +967,7 @@ set_tests_properties(test_dataloader_unkeep_order PROPERTIES TIMEOUT 120) set_tests_properties(test_reader_reset PROPERTIES TIMEOUT 120) set_tests_properties(test_pool3d_api PROPERTIES TIMEOUT 120) set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_solve_op PROPERTIES TIMEOUT 120) if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120) @@ -1025,8 +1033,9 @@ if((WITH_ROCM OR WITH_GPU) AND NOT WIN32) set_tests_properties(test_collective_sendrecv PROPERTIES TIMEOUT 120) endif() if(WITH_GPU OR WITH_ROCM) - 
set_tests_properties(test_imperative_auto_mixed_precision PROPERTIES TIMEOUT 120) + set_tests_properties(test_imperative_auto_mixed_precision PROPERTIES TIMEOUT 300) set_tests_properties(test_parallel_dygraph_sync_batch_norm PROPERTIES TIMEOUT 120) set_tests_properties(test_rank_attention_op PROPERTIES TIMEOUT 120) endif() set_tests_properties(test_inplace_addto_strategy PROPERTIES TIMEOUT 120) +set_tests_properties(test_eigvals_op PROPERTIES TIMEOUT 400) diff --git a/python/paddle/fluid/tests/unittests/fft/CMakeLists.txt b/python/paddle/fluid/tests/unittests/fft/CMakeLists.txt new file mode 100644 index 0000000000000..f71e04c09aa38 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/fft/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) +endforeach(TEST_OP) diff --git a/python/paddle/fluid/tests/unittests/fft/__init__.py b/python/paddle/fluid/tests/unittests/fft/__init__.py new file mode 100644 index 0000000000000..b9a7651e44909 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/fft/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py b/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py new file mode 100644 index 0000000000000..b00111f6821ae --- /dev/null +++ b/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py @@ -0,0 +1,108 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
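The new `spectral_op_np.py` helper below wraps NumPy's private `_pocketfft` machinery to build c2c/r2c/c2r reference transforms for the spectral-op tests. A quick sanity check of the c2c path against NumPy's public API, assuming the module is importable (e.g. run from within the new `fft` test directory) and a NumPy version that still exposes these internals:

.. code-block:: python

    import numpy as np

    from spectral_op_np import fft_c2c  # the helper defined below

    x = np.random.randn(4, 4) + 1j * np.random.randn(4, 4)
    ref = fft_c2c(x, axes=(0, 1), normalization="backward", forward=True)
    np.testing.assert_allclose(ref, np.fft.fftn(x, axes=(0, 1)), rtol=1e-12)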
+ +import numpy as np +from functools import partial +from numpy import asarray +from numpy.fft._pocketfft import _raw_fft, _raw_fftnd, _get_forward_norm, _get_backward_norm, _cook_nd_args + + +def _fftc2c(a, n=None, axis=-1, norm=None, forward=None): + a = asarray(a) + if n is None: + n = a.shape[axis] + if forward: + inv_norm = _get_forward_norm(n, norm) + else: + inv_norm = _get_backward_norm(n, norm) + output = _raw_fft(a, n, axis, False, forward, inv_norm) + return output + + +def _fftr2c(a, n=None, axis=-1, norm=None, forward=None): + a = asarray(a) + if n is None: + n = a.shape[axis] + if forward: + inv_norm = _get_forward_norm(n, norm) + else: + inv_norm = _get_backward_norm(n, norm) + output = _raw_fft(a, n, axis, True, True, inv_norm) + if not forward: + output = output.conj() + return output + + +def _fftc2r(a, n=None, axis=-1, norm=None, forward=None): + a = asarray(a) + if n is None: + n = (a.shape[axis] - 1) * 2 + if forward: + inv_norm = _get_forward_norm(n, norm) + else: + inv_norm = _get_backward_norm(n, norm) + output = _raw_fft(a.conj() + if forward else a, n, axis, True, False, inv_norm) + return output + + +def fft_c2c(x, axes, normalization, forward): + f = partial(_fftc2c, forward=forward) + y = _raw_fftnd(x, s=None, axes=axes, function=f, norm=normalization) + return y + + +def fft_c2c_backward(dy, axes, normalization, forward): + f = partial(_fftc2c, forward=forward) + dx = _raw_fftnd(dy, s=None, axes=axes, function=f, norm=normalization) + return dx + + +def fft_r2c(x, axes, normalization, forward, onesided): + a = asarray(x) + s, axes = _cook_nd_args(a, axes=axes) + if onesided: + a = _fftr2c(a, s[-1], axes[-1], normalization, forward) + for ii in range(len(axes) - 1): + a = _fftc2c(a, s[ii], axes[ii], normalization, forward) + else: + a = fft_c2c(x, axes, normalization, forward) + return a + + +def fft_r2c_backward(dy, x, axes, normalization, forward, onesided): + a = dy + if not onesided: + a = fft_c2c_backward(a, axes, normalization, forward).real + else: + pad_widths = [(0, 0)] * a.ndim + last_axis = axes[-1] + if last_axis < 0: + last_axis += a.ndim + last_dim_size = a.shape[last_axis] + pad_widths[last_axis] = (0, x.shape[last_axis] - last_dim_size) + a = np.pad(a, pad_width=pad_widths) + a = fft_c2c_backward(a, axes, normalization, forward).real + return a + + +def fft_c2r(x, axes, normalization, forward, last_dim_size): + a = asarray(x) + s, axes = _cook_nd_args(a, axes=axes, invreal=1) + if last_dim_size is not None: + s[-1] = last_dim_size + for ii in range(len(axes) - 1): + a = _fftc2c(a, s[ii], axes[ii], normalization, forward) + a = _fftc2r(a, s[-1], axes[-1], normalization, forward) + return a diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft.py b/python/paddle/fluid/tests/unittests/fft/test_fft.py new file mode 100644 index 0000000000000..c83c943217d4e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/fft/test_fft.py @@ -0,0 +1,1050 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
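`fft_r2c` above keeps only the one-sided half of the spectrum when `onesided` is set, which is why `fft_r2c_backward` pads the incoming gradient back to the full last-dimension length before running the c2c inverse. The underlying length bookkeeping, shown with NumPy's public API:

.. code-block:: python

    import numpy as np

    x = np.random.randn(8)
    half = np.fft.rfft(x)   # one-sided: 8 // 2 + 1 = 5 complex coefficients
    full = np.fft.fft(x)    # two-sided: 8 complex coefficients
    np.testing.assert_allclose(half, full[:5], rtol=1e-7, atol=1e-12)
    # recovering a length-8 signal needs the original length back, hence the
    # zero-padding of the truncated gradient in fft_r2c_backward above
    np.testing.assert_allclose(np.fft.irfft(half, n=8), x, rtol=1e-7, atol=1e-12)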
+# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import re +import sys +import unittest + +import numpy as np +import paddle +import scipy.fft + +DEVICES = [paddle.CPUPlace()] +if paddle.is_compiled_with_cuda(): + DEVICES.append(paddle.CUDAPlace(0)) + +TEST_CASE_NAME = 'suffix' +# All test case will use float64 for compare percision, refs: +# https://github.com/PaddlePaddle/Paddle/wiki/Upgrade-OP-Precision-to-Float64 +RTOL = { + 'float32': 1e-03, + 'complex64': 1e-3, + 'float64': 1e-7, + 'complex128': 1e-7 +} +ATOL = {'float32': 0.0, 'complex64': 0, 'float64': 0.0, 'complex128': 0} + + +def rand_x(dims=1, + dtype='float64', + min_dim_len=1, + max_dim_len=10, + complex=False): + shape = [np.random.randint(min_dim_len, max_dim_len) for i in range(dims)] + if complex: + return np.random.randn(*shape).astype(dtype) + 1.j * np.random.randn( + *shape).astype(dtype) + else: + return np.random.randn(*shape).astype(dtype) + + +def place(devices, key='place'): + def decorate(cls): + module = sys.modules[cls.__module__].__dict__ + raw_classes = { + k: v + for k, v in module.items() if k.startswith(cls.__name__) + } + + for raw_name, raw_cls in raw_classes.items(): + for d in devices: + test_cls = dict(raw_cls.__dict__) + test_cls.update({key: d}) + new_name = raw_name + '.' + d.__class__.__name__ + module[new_name] = type(new_name, (raw_cls, ), test_cls) + del module[raw_name] + return cls + + return decorate + + +def parameterize(fields, values=None): + + fields = [fields] if isinstance(fields, str) else fields + params = [dict(zip(fields, vals)) for vals in values] + + def decorate(cls): + test_cls_module = sys.modules[cls.__module__].__dict__ + for k, v in enumerate(params): + test_cls = dict(cls.__dict__) + test_cls.update(v) + name = cls.__name__ + str(k) + name = name + '.' 
+ v.get('suffix') if v.get('suffix') else name + + test_cls_module[name] = type(name, (cls, ), test_cls) + + for m in list(cls.__dict__): + if m.startswith("test"): + delattr(cls, m) + return cls + + return decorate + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), + [('test_x_float64', rand_x(5, np.float64), None, -1, 'backward'), + ('test_x_complex', rand_x( + 5, complex=True), None, -1, + 'backward'), ('test_n_grater_input_length', rand_x( + 5, max_dim_len=5), 11, -1, + 'backward'), ('test_n_smaller_than_input_length', rand_x( + 5, min_dim_len=5, complex=True), 3, -1, 'backward'), + ('test_axis_not_last', rand_x(5), None, 3, 'backward'), + ('test_norm_forward', rand_x(5), None, 3, 'forward'), + ('test_norm_ortho', rand_x(5), None, 3, 'ortho')]) +class TestFft(unittest.TestCase): + def test_fft(self): + """Test fft with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + self.assertTrue( + np.allclose( + scipy.fft.fft(self.x, self.n, self.axis, self.norm), + paddle.fft.fft( + paddle.to_tensor(self.x), self.n, self.axis, self.norm), + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype)))) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [ + ('test_n_nagative', rand_x(2), -1, -1, 'backward', ValueError), + ('test_n_zero', rand_x(2), 0, -1, 'backward', ValueError), + ('test_axis_out_of_range', rand_x(1), None, 10, 'backward', ValueError), + ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', ValueError), + ('test_norm_not_in_enum_value', rand_x(2), None, -1, 'random', ValueError) +]) +class TestFftException(unittest.TestCase): + def test_fft(self): + """Test fft with buoudary condition + Test case include: + - n out of range + - axis out of range + - axis type error + - norm out of range + """ + with self.assertRaises(self.expect_exception): + paddle.fft.fft( + paddle.to_tensor(self.x), self.n, self.axis, self.norm) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [ + ('test_x_float64', rand_x(5), None, (0, 1), 'backward'), + ('test_x_complex128', rand_x( + 5, complex=True), None, (0, 1), 'backward'), + ('test_n_grater_input_length', rand_x( + 5, max_dim_len=5), (6, 6), (0, 1), 'backward'), + ('test_n_smaller_than_input_length', rand_x( + 5, min_dim_len=5, complex=True), (4, 4), (0, 1), 'backward'), + ('test_axis_random', rand_x(5), None, (1, 2), 'backward'), + ('test_axis_none', rand_x(5), None, None, 'backward'), + ('test_norm_forward', rand_x(5), None, (0, 1), 'forward'), + ('test_norm_ortho', rand_x(5), None, (0, 1), 'ortho'), + ]) +class TestFft2(unittest.TestCase): + def test_fft2(self): + """Test fft2 with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + self.assertTrue( + np.allclose( + scipy.fft.fft2(self.x, self.n, self.axis, self.norm), + paddle.fft.fft2( + paddle.to_tensor(self.x), self.n, self.axis, self.norm), + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype)))) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [('test_x_complex_input', rand_x( + 2, complex=True), None, (0, 1), None, + ValueError), ('test_x_1dim_tensor', rand_x(1), None, (0, 1), None, + ValueError), ('test_n_nagative', rand_x(2), -1, (0, 1), + 'backward', ValueError), + ('test_n_len_not_equal_axis', rand_x( + 5, max_dim_len=5), 11, (0, 1), 'backward', + ValueError), ('test_n_zero', rand_x(2), (0, 0), (0, 1), 'backward', + ValueError), 
('test_axis_out_of_range', rand_x(2), None, + (0, 1, 2), 'backward', ValueError), + ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', ValueError), + ('test_axis_not_sequence', rand_x(5), None, -10, 'backward', ValueError), + ('test_norm_not_enum', rand_x(2), None, -1, 'random', ValueError)]) +class TestFft2Exception(unittest.TestCase): + def test_fft2(self): + """Test fft2 with buoudary condition + Test case include: + - input type error + - input dim error + - n out of range + - axis out of range + - axis type error + - norm out of range + """ + with paddle.fluid.dygraph.guard(self.place): + with self.assertRaises(self.expect_exception): + paddle.fft.fft2( + paddle.to_tensor(self.x), self.n, self.axis, self.norm) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), + [('test_x_float64', rand_x(5, np.float64), None, None, 'backward'), + ('test_x_complex128', rand_x( + 5, complex=True), None, None, + 'backward'), ('test_n_grater_input_length', rand_x( + 5, max_dim_len=5), (6, 6), (1, 2), 'backward'), ( + 'test_n_smaller_input_length', rand_x( + 5, min_dim_len=5, complex=True), (3, 3), (1, 2), 'backward'), + ('test_axis_not_default', rand_x(5), None, (1, 2), + 'backward'), ('test_norm_forward', rand_x(5), None, None, 'forward'), + ('test_norm_ortho', rand_x(5), None, None, 'ortho')]) +class TestFftn(unittest.TestCase): + def test_fftn(self): + """Test fftn with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + scipy.fft.fftn(self.x, self.n, self.axis, self.norm), + paddle.fft.fftn( + paddle.to_tensor(self.x), self.n, self.axis, self.norm), + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype))) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [ + ('test_x_complex128', + (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) + ).astype(np.complex128), None, -1, "backward"), + ('test_n_grater_than_input_length', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), 4, -1, + "backward"), + ('test_n_smaller_than_input_length', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), 2, -1, + "backward"), + ('test_axis_not_last', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, 1, + "backward"), + ('test_norm_forward', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, 1, + "forward"), + ('test_norm_ortho', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, -1, + "ortho"), +]) +class TestHfft(unittest.TestCase): + def test_hfft(self): + """Test hfft with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + scipy.fft.hfft(self.x, self.n, self.axis, self.norm), + paddle.fft.hfft( + paddle.to_tensor(self.x), self.n, self.axis, self.norm), + rtol=1e-5, + atol=0) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [ + ('test_x_complex128', + (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) + ).astype(np.complex128), None, -1, "backward"), + ('test_n_grater_than_input_length', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), 4, -1, + "backward"), + ('test_n_smaller_than_input_length', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), 2, -1, + "backward"), + ('test_axis_not_last', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, -1, + "backward"), + ('test_norm_forward', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, -1, + "forward"), + ('test_norm_ortho', + 
np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, -1, + "ortho"), +]) +class TestIrfft(unittest.TestCase): + def test_irfft(self): + """Test irfft with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + scipy.fft.irfft(self.x, self.n, self.axis, self.norm), + paddle.fft.irfft( + paddle.to_tensor(self.x), self.n, self.axis, self.norm), + rtol=1e-5, + atol=0) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [ + ('test_x_complex128', + (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) + ).astype(np.complex128), None, None, "backward"), + ('test_n_grater_than_input_length', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [4], None, + "backward"), + ('test_n_smaller_than_input_length', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [2], None, + "backward"), + ('test_axis_not_last', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None, + "backward"), + ('test_norm_forward', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None, + "forward"), + ('test_norm_ortho', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None, + "ortho"), +]) +class TestIrfftn(unittest.TestCase): + def test_irfftn(self): + """Test irfftn with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + scipy.fft.irfftn(self.x, self.n, self.axis, self.norm), + paddle.fft.irfftn( + paddle.to_tensor(self.x), self.n, self.axis, self.norm), + rtol=1e-5, + atol=0) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [ + ('test_x_complex128', + (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) + ).astype(np.complex128), None, None, "backward"), + ('test_n_grater_than_input_length', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [4], None, + "backward"), + ('test_n_smaller_than_input_length', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [2], None, + "backward"), + ('test_axis_not_last', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None, + "backward"), + ('test_norm_forward', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None, + "forward"), + ('test_norm_ortho', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None, + "ortho"), +]) +class TestHfftn(unittest.TestCase): + def test_hfftn(self): + """Test hfftn with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + scipy.fft.hfftn(self.x, self.n, self.axis, self.norm), + paddle.fft.hfftn( + paddle.to_tensor(self.x), self.n, self.axis, self.norm), + rtol=1e-5, + atol=0) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'x', 's', 'axis', 'norm'), [ + ('test_x_complex128', + (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) + ).astype(np.complex128), None, (-2, -1), "backward"), + ('test_with_s', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + [2, 2], (-2, -1), "backward", ValueError), + ('test_axis_not_last', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1), + "backward"), + ('test_norm_forward', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1), + "forward"), + ('test_norm_ortho', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1), + "ortho"), +]) +class TestHfft2(unittest.TestCase): + def test_hfft2(self): + """Test hfft2 with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + 
np.testing.assert_allclose( + scipy.fft.hfft2(self.x, self.s, self.axis, self.norm), + paddle.fft.hfft2( + paddle.to_tensor(self.x), self.s, self.axis, self.norm), + rtol=1e-5, + atol=0) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'x', 's', 'axis', 'norm'), [ + ('test_x_complex128', + (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) + ).astype(np.complex128), None, (-2, -1), "backward"), + ('test_n_equal_input_length', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (4, 6), (-2, -1), + "backward"), + ('test_axis_not_last', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1), + "backward"), + ('test_norm_forward', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1), + "forward"), + ('test_norm_ortho', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1), + "ortho"), +]) +class TestIrfft2(unittest.TestCase): + def test_irfft2(self): + """Test irfft2 with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + scipy.fft.irfft2(self.x, self.s, self.axis, self.norm), + paddle.fft.irfft2( + paddle.to_tensor(self.x), self.s, self.axis, self.norm), + rtol=1e-5, + atol=0) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [( + 'test_bool_input', + (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(np.bool8), + None, -1, 'backward', NotImplementedError), ( + 'test_n_nagative', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), -1, -1, + 'backward', ValueError), ( + 'test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4), + 0, -1, 'backward', ValueError), ( + 'test_n_type', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + (1, 2, 3), -1, 'backward', ValueError), ( + 'test_axis_out_of_range', + np.random.randn(4) + 1j * np.random.randn(4), None, 10, + 'backward', ValueError), ( + 'test_axis_with_array', + np.random.randn(4) + 1j * np.random.randn(4), None, + (0, 1), 'backward', ValueError), ( + 'test_norm_not_in_enum_value', + np.random.randn(4, 4) + 1j * np.random.randn(4, 4), + None, -1, 'random', ValueError)]) +class TestHfftException(unittest.TestCase): + def test_hfft(self): + """Test hfft with buoudary condition + Test case include: + Test case include: + - n out of range + - n type error + - axis out of range + - axis type error + - norm out of range + """ + with paddle.fluid.dygraph.guard(self.place): + with self.assertRaises(self.expect_exception): + paddle.fft.hfft( + paddle.to_tensor(self.x), self.n, self.axis, self.norm) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [('test_n_nagative', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), -1, -1, + 'backward', ValueError), + ('test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4), 0, -1, + 'backward', ValueError), + ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + (1, 2), -1, 'backward', ValueError), + ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4), + None, 10, 'backward', ValueError), + ('test_axis_with_array', np.random.randn(4) + 1j * np.random.randn(4), + None, (0, 1), 'backward', + ValueError), ('test_norm_not_in_enum_value', + np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None, + None, 'random', ValueError)]) +class TestIrfftException(unittest.TestCase): + def test_irfft(self): + """ + Test irfft with buoudary condition + Test case include: + - n out of range + - n 
type error + - axis type error + - axis out of range + - norm out of range + """ + with paddle.fluid.dygraph.guard(self.place): + with self.assertRaises(self.expect_exception): + paddle.fft.irfft( + paddle.to_tensor(self.x), self.n, self.axis, self.norm) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [('test_bool_input', + (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) + ).astype(np.bool8), None, (-2, -1), 'backward', NotImplementedError), + ('test_n_nagative', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), + (-2, -1), 'backward', ValueError), + ('test_n_zero', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + (0, 0), (-2, -1), 'backward', ValueError), + ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + 3, None, 'backward', ValueError), + ('test_n_axis_dim', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (1, 2), (-1), + 'backward', ValueError), ('test_axis_out_of_range', + np.random.randn(4) + 1j * np.random.randn(4), + None, (1, 2), 'backward', ValueError), + ('test_axis_type', np.random.randn(4) + 1j * np.random.randn(4), None, -1, + 'backward', + ValueError), ('test_norm_not_in_enum_value', + np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None, + None, 'random', ValueError)]) +class TestHfft2Exception(unittest.TestCase): + def test_hfft2(self): + """ + Test hfft2 with buoudary condition + Test case include: + - input type error + - n type error + - n out of range + - axis out of range + - the dimensions of n and axis are different + - norm out of range + """ + with paddle.fluid.dygraph.guard(self.place): + with self.assertRaises(self.expect_exception): + paddle.fft.hfft2( + paddle.to_tensor(self.x), self.n, self.axis, self.norm) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [('test_n_nagative', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), + (-2, -1), 'backward', ValueError), + ('test_zero_point', + np.random.randn(4, 4, 1) + 1j * np.random.randn(4, 4, 1), None, (-2, -1), + "backward", ValueError), + ('test_n_zero', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + (0, 0), (-2, -1), 'backward', ValueError), + ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + 3, -1, 'backward', + ValueError), ('test_n_axis_dim', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + (1, 2), (-3, -2, -1), 'backward', ValueError), + ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4), + None, (1, 2), 'backward', ValueError), ( + 'test_axis_type', np.random.randn(4) + 1j * np.random.randn(4), None, + 1, 'backward', + ValueError), ('test_norm_not_in_enum_value', + np.random.randn(4, 4) + 1j * np.random.randn(4, 4), + None, None, 'random', ValueError)]) +class TestIrfft2Exception(unittest.TestCase): + def test_irfft2(self): + """ + Test irfft2 with buoudary condition + Test case include: + - input type error + - n type error + - n out of range + - axis out of range + - the dimensions of n and axis are different + - norm out of range + """ + with paddle.fluid.dygraph.guard(self.place): + with self.assertRaises(self.expect_exception): + paddle.fft.irfft2( + paddle.to_tensor(self.x), self.n, self.axis, self.norm) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [('test_bool_input', + (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) + ).astype(np.bool8), None, 
(-2, -1), 'backward', NotImplementedError), + ('test_n_nagative', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), + (-2, -1), 'backward', ValueError), + ('test_n_zero', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + (0, 0), (-2, -1), 'backward', ValueError), + ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + 3, -1, 'backward', ValueError), + ('test_n_axis_dim', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + (1, 2), (-3, -2, -1), 'backward', + ValueError), ('test_axis_out_of_range', + np.random.randn(4) + 1j * np.random.randn(4), None, + (10, 20), 'backward', ValueError), + ('test_axis_type', np.random.randn(4) + 1j * np.random.randn(4), None, 1, + 'backward', + ValueError), ('test_norm_not_in_enum_value', + np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None, + None, 'random', ValueError)]) +class TestHfftnException(unittest.TestCase): + def test_hfftn(self): + """Test hfftn with buoudary condition + Test case include: + - input type error + - n type error + - n out of range + - axis out of range + - the dimensions of n and axis are different + - norm out of range + """ + with paddle.fluid.dygraph.guard(self.place): + with self.assertRaises(self.expect_exception): + paddle.fft.hfftn( + paddle.to_tensor(self.x), self.n, self.axis, self.norm) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [('test_n_nagative', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), + (-2, -1), 'backward', ValueError), + ('test_n_zero', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + (0, 0), (-2, -1), 'backward', ValueError), + ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + 3, -1, 'backward', + ValueError), ('test_n_axis_dim', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + (1, 2), (-3, -2, -1), 'backward', ValueError), + ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4), + None, (10, 20), 'backward', ValueError), + ('test_axis_type', np.random.randn(4) + 1j * np.random.randn(4), None, 1, + 'backward', + ValueError), ('test_norm_not_in_enum_value', + np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None, + None, 'random', ValueError)]) +class TestIrfftnException(unittest.TestCase): + def test_irfftn(self): + """Test irfftn with buoudary condition + Test case include: + - n out of range + - n type error + - axis out of range + - norm out of range + - the dimensions of n and axis are different + """ + with paddle.fluid.dygraph.guard(self.place): + with self.assertRaises(self.expect_exception): + paddle.fft.irfftn( + paddle.to_tensor(self.x), self.n, self.axis, self.norm) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), + [('test_x_float64', rand_x(5, np.float64), None, -1, 'backward'), ( + 'test_n_grater_than_input_length', rand_x( + 5, max_dim_len=5), 11, -1, 'backward'), + ('test_n_smaller_than_input_length', rand_x( + 5, min_dim_len=5), 3, -1, + 'backward'), ('test_axis_not_last', rand_x(5), None, 3, 'backward'), + ('test_norm_forward', rand_x(5), None, 3, 'forward'), + ('test_norm_ortho', rand_x(5), None, 3, 'ortho')]) +class TestRfft(unittest.TestCase): + def test_rfft(self): + """Test rfft with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + self.assertTrue( + np.allclose( + scipy.fft.rfft(self.x, self.n, self.axis, self.norm), + paddle.fft.rfft( + paddle.to_tensor(self.x), self.n, self.axis, self.norm), + 
rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype)))) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [ + ('test_n_nagative', rand_x(2), -1, -1, 'backward', ValueError), + ('test_n_zero', rand_x(2), 0, -1, 'backward', ValueError), + ('test_axis_out_of_range', rand_x(1), None, 10, 'backward', ValueError), + ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', ValueError), + ('test_norm_not_in_enum_value', rand_x(2), None, -1, 'random', ValueError) +]) +class TestRfftException(unittest.TestCase): + def test_rfft(self): + """Test rfft with buoudary condition + Test case include: + - n out of range + - axis out of range + - axis type error + - norm out of range + - the dimensions of n and axis are different + """ + with self.assertRaises(self.expect_exception): + paddle.fft.rfft( + paddle.to_tensor(self.x), self.n, self.axis, self.norm) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [ + ('test_x_float64', rand_x(5), None, (0, 1), 'backward'), + ('test_n_grater_input_length', rand_x( + 5, max_dim_len=5), (6, 6), (0, 1), 'backward'), + ('test_n_smaller_than_input_length', rand_x( + 5, min_dim_len=5), (4, 4), (0, 1), 'backward'), + ('test_axis_random', rand_x(5), None, (1, 2), 'backward'), + ('test_axis_none', rand_x(5), None, None, 'backward'), + ('test_norm_forward', rand_x(5), None, (0, 1), 'forward'), + ('test_norm_ortho', rand_x(5), None, (0, 1), 'ortho'), + ]) +class TestRfft2(unittest.TestCase): + def test_rfft2(self): + """Test rfft2 with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + self.assertTrue( + np.allclose( + scipy.fft.rfft2(self.x, self.n, self.axis, self.norm), + paddle.fft.rfft2( + paddle.to_tensor(self.x), self.n, self.axis, self.norm), + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype)))) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [ + ('test_x_complex_input', rand_x( + 2, complex=True), None, (0, 1), 'backward', RuntimeError), + ('test_x_1dim_tensor', rand_x(1), None, (0, 1), 'backward', ValueError), + ('test_n_nagative', rand_x(2), -1, (0, 1), 'backward', ValueError), + ('test_n_zero', rand_x(2), 0, (0, 1), 'backward', ValueError), + ('test_axis_out_of_range', rand_x(2), None, (0, 1, 2), 'backward', + ValueError), + ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', + ValueError), + ('test_axis_not_sequence', rand_x(5), None, -10, 'backward', + ValueError), + ('test_norm_not_enum', rand_x(2), None, -1, 'random', ValueError), + ]) +class TestRfft2Exception(unittest.TestCase): + def test_rfft2(self): + """Test rfft2 with buoudary condition + Test case include: + - input type error + - input dim error + - n out of range + - axis out of range + - norm out of range + - the dimensions of n and axis are different + """ + with paddle.fluid.dygraph.guard(self.place): + with self.assertRaises(self.expect_exception): + paddle.fft.rfft2( + paddle.to_tensor(self.x), self.n, self.axis, self.norm) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [ + ('test_x_float64', rand_x(5, np.float64), None, None, 'backward'), + ('test_n_grater_input_length', rand_x( + 5, max_dim_len=5), (6, 6), (1, 2), 'backward'), + ('test_n_smaller_input_length', rand_x( + 5, min_dim_len=5), (3, 3), (1, 2), 'backward'), + ('test_axis_not_default', rand_x(5), None, (1, 2), 'backward'), + ('test_norm_forward', rand_x(5), None, None, 'forward'), + 
('test_norm_ortho', rand_x(5), None, None, 'ortho'), + ]) +class TestRfftn(unittest.TestCase): + def test_rfftn(self): + """Test rfftn with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + self.assertTrue( + np.allclose( + scipy.fft.rfftn(self.x, self.n, self.axis, self.norm), + paddle.fft.rfftn( + paddle.to_tensor(self.x), self.n, self.axis, self.norm), + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype)))) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [('test_x_complex', rand_x( + 4, complex=True), None, None, 'backward', + RuntimeError), ('test_n_nagative', rand_x(4), (-1, -1), (1, 2), + 'backward', ValueError), + ('test_n_not_sequence', rand_x(4), -1, None, 'backward', ValueError), + ('test_n_zero', rand_x(4), 0, None, 'backward', ValueError), ( + 'test_axis_out_of_range', rand_x(1), None, [0, 1], 'backward', + ValueError), + ('test_norm_not_in_enum', rand_x(2), None, -1, 'random', ValueError)]) +class TestRfftnException(unittest.TestCase): + def test_rfftn(self): + """Test rfftn with buoudary condition + Test case include: + - n out of range + - axis out of range + - norm out of range + - the dimensions of n and axis are different + """ + with paddle.fluid.dygraph.guard(self.place): + with self.assertRaises(self.expect_exception): + paddle.fft.rfftn( + paddle.to_tensor(self.x), self.n, self.axis, self.norm) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), + [('test_x_float64', rand_x(5, np.float64), None, -1, 'backward'), ( + 'test_n_grater_than_input_length', rand_x( + 5, max_dim_len=5), 11, -1, 'backward'), + ('test_n_smaller_than_input_length', rand_x( + 5, min_dim_len=5), 3, -1, + 'backward'), ('test_axis_not_last', rand_x(5), None, 3, 'backward'), + ('test_norm_forward', rand_x(5), None, 3, 'forward'), + ('test_norm_ortho', rand_x(5), None, 3, 'ortho')]) +class TestIhfft(unittest.TestCase): + def test_ihfft(self): + """Test ihfft with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + scipy.fft.ihfft(self.x, self.n, self.axis, self.norm), + paddle.fft.ihfft( + paddle.to_tensor(self.x), self.n, self.axis, self.norm), + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype))) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [ + ('test_n_nagative', rand_x(2), -1, -1, 'backward', ValueError), + ('test_n_zero', rand_x(2), 0, -1, 'backward', ValueError), + ('test_axis_out_of_range', rand_x(1), None, 10, 'backward', ValueError), + ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', ValueError), + ('test_norm_not_in_enum_value', rand_x(2), None, -1, 'random', ValueError) +]) +class TestIhfftException(unittest.TestCase): + def test_ihfft(self): + """Test ihfft with buoudary condition + Test case include: + - axis type error + - axis out of range + - norm out of range + """ + with paddle.fluid.dygraph.guard(self.place): + with self.assertRaises(self.expect_exception): + paddle.fft.ihfft( + paddle.to_tensor(self.x), self.n, self.axis, self.norm) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [ + ('test_x_float64', rand_x(5), None, (0, 1), 'backward'), + ('test_n_grater_input_length', rand_x( + 5, max_dim_len=5), (11, 11), (0, 1), 'backward'), + ('test_n_smaller_than_input_length', rand_x( + 5, min_dim_len=5), (1, 1), (0, 1), 'backward'), + ('test_axis_random', rand_x(5), None, (1, 2), 'backward'), + 
('test_axis_none', rand_x(5), None, None, 'backward'), + ('test_norm_forward', rand_x(5), None, (0, 1), 'forward'), + ('test_norm_ortho', rand_x(5), None, (0, 1), 'ortho'), + ]) +class TestIhfft2(unittest.TestCase): + def test_ihfft2(self): + """Test ihfft2 with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + scipy.fft.ihfft2(self.x, self.n, self.axis, self.norm), + paddle.fft.ihfft2( + paddle.to_tensor(self.x), self.n, self.axis, self.norm), + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype))) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [('test_x_complex_input', rand_x( + 2, complex=True), None, (0, 1), None, ValueError), + ('test_x_1dim_tensor', rand_x(1), None, (0, 1), None, + ValueError), ('test_n_nagative', rand_x(2), -1, (0, 1), 'backward', + ValueError), ('test_n_len_not_equal_axis', rand_x( + 5, max_dim_len=5), 11, (0, 1), 'backward', ValueError), + ('test_n_zero', rand_x(2), (0, 0), (0, 1), 'backward', ValueError), + ('test_axis_out_of_range', rand_x(2), None, (0, 1, 2), 'backward', + ValueError), ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', + ValueError), ('test_axis_not_sequence', rand_x(5), None, + -10, 'backward', ValueError), + ('test_norm_not_enum', rand_x(2), None, -1, 'random', ValueError)]) +class TestIhfft2Exception(unittest.TestCase): + def test_ihfft2(self): + """Test ihfft2 with buoudary condition + Test case include: + - input type error + - input dim error + - n out of range + - axis type error + - axis out of range + - norm out of range + """ + with paddle.fluid.dygraph.guard(self.place): + with self.assertRaises(self.expect_exception): + paddle.fft.ihfft2( + paddle.to_tensor(self.x), self.n, self.axis, self.norm) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), + [('test_x_float64', rand_x(5, np.float64), None, None, 'backward'), + ('test_n_grater_input_length', rand_x( + 5, max_dim_len=5), (11, 11), (0, 1), + 'backward'), ('test_n_smaller_input_length', rand_x( + 5, min_dim_len=5), (1, 1), (0, 1), 'backward'), + ('test_axis_not_default', rand_x(5), None, (1, 2), + 'backward'), ('test_norm_forward', rand_x(5), None, None, 'forward'), + ('test_norm_ortho', rand_x(5), None, None, 'ortho')]) +class TestIhfftn(unittest.TestCase): + def test_ihfftn(self): + """Test ihfftn with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + self.assertTrue( + np.allclose( + scipy.fft.ihfftn(self.x, self.n, self.axis, self.norm), + paddle.fft.ihfftn( + paddle.to_tensor(self.x), self.n, self.axis, self.norm), + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype)))) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [('test_x_complex', rand_x( + 4, complex=True), None, None, 'backward', RuntimeError), + ('test_n_nagative', rand_x(4), -1, None, 'backward', ValueError), + ('test_n_zero', rand_x(4), 0, None, 'backward', ValueError), ( + 'test_axis_out_of_range', rand_x(1), None, [0, 1], 'backward', + ValueError), + ('test_norm_not_in_enum', rand_x(2), None, -1, 'random', ValueError)]) +class TestIhfftnException(unittest.TestCase): + def test_ihfftn(self): + """Test ihfftn with buoudary condition + Test case include: + - input type error + - n out of range + - axis out of range + - norm out of range + """ + with paddle.fluid.dygraph.guard(self.place): + with self.assertRaises(self.expect_exception): + paddle.fft.ihfftn( + 
paddle.to_tensor(self.x), self.n, self.axis, self.norm) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'n', 'd', 'dtype'), [ + ('test_without_d', 20, 1, 'float32'), + ('test_with_d', 20, 0.5, 'float32'), +]) +class TestFftFreq(unittest.TestCase): + def test_fftfreq(self): + """Test fftfreq with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + scipy.fft.fftfreq(self.n, self.d).astype(self.dtype), + paddle.fft.fftfreq(self.n, self.d, self.dtype).numpy(), + rtol=RTOL.get(str(self.dtype)), + atol=ATOL.get(str(self.dtype))) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'n', 'd', 'dtype'), [ + ('test_without_d', 20, 1, 'float32'), + ('test_with_d', 20, 0.5, 'float32'), +]) +class TestRfftFreq(unittest.TestCase): + def test_rfftfreq(self): + """Test rfftfreq with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + scipy.fft.rfftfreq(self.n, self.d).astype(self.dtype), + paddle.fft.rfftfreq(self.n, self.d, self.dtype).numpy(), + rtol=RTOL.get(str(self.dtype)), + atol=ATOL.get(str(self.dtype))) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'x', 'axes', 'dtype'), [ + ('test_1d', np.random.randn(10), (0, ), 'float64'), + ('test_2d', np.random.randn(10, 10), (0, 1), 'float64'), +]) +class TestFftShift(unittest.TestCase): + def test_fftshift(self): + """Test fftshift with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + scipy.fft.fftshift(self.x, self.axes), + paddle.fft.fftshift(paddle.to_tensor(self.x), + self.axes).numpy(), + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype))) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'x', 'axes'), [ + ('test_1d', np.random.randn(10), (0, ), 'float64'), + ('test_2d', np.random.randn(10, 10), (0, 1), 'float64'), +]) +class TestIfftShift(unittest.TestCase): + def test_ifftshift(self): + """Test ifftshift with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + scipy.fft.ifftshift(self.x, self.axes), + paddle.fft.ifftshift(paddle.to_tensor(self.x), + self.axes).numpy(), + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype))) + + +if __name__ == '__main__': + unittest.main() + +# yapf: enable diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py b/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py new file mode 100644 index 0000000000000..ac9d1557b53e9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py @@ -0,0 +1,894 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
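Reading aid (not part of the patch): the static-graph suite below reuses the `place` and `parameterize` decorators imported from test_fft, whose implementation above is fairly dense. The sketch that follows shows the underlying pattern those decorators rely on — each parameter tuple becomes attributes of a freshly generated unittest.TestCase subclass registered in the defining module, so every scipy-vs-paddle comparison is collected as an individually named test. This is a minimal illustration under that assumption; the helper name `make_cases` is hypothetical and does not appear in the patch.

    import sys

    def make_cases(base, param_dicts):
        """Illustrative only: generate one TestCase subclass per parameter dict."""
        module_ns = sys.modules[base.__module__].__dict__
        for idx, params in enumerate(param_dicts):
            attrs = dict(base.__dict__)
            attrs.update(params)  # e.g. {'test_case': 'float64', 'x': ..., 'n': None, 'axis': -1, 'norm': 'backward'}
            name = '{}_{}_{}'.format(base.__name__, idx, params.get('test_case', ''))
            module_ns[name] = type(name, (base,), attrs)  # picked up by unittest discovery
        for method in [m for m in list(base.__dict__) if m.startswith('test')]:
            delattr(base, method)  # keep the bare template class from being collected twice
        return base

    # Hypothetical usage: define a template TestCase whose test methods read
    # self.x, self.n, self.axis, self.norm, then call make_cases(Template, [...]).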
+ +import contextlib +import re +import sys +import unittest + +import numpy as np +import paddle +import scipy.fft + +from test_fft import (ATOL, DEVICES, RTOL, TEST_CASE_NAME, parameterize, place, + rand_x) + + +@contextlib.contextmanager +def stgraph(func, place, x, n, axes, norm): + """static graph exec context""" + paddle.enable_static() + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + input = paddle.static.data('input', x.shape, dtype=x.dtype) + output = func(input, n, axes, norm) + + exe = paddle.static.Executor(place) + exe.run(sp) + [output] = exe.run(mp, feed={'input': x}, fetch_list=[output]) + yield output + paddle.disable_static() + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), + [('test_x_float64', rand_x(5, np.float64), None, -1, 'backward'), + ('test_x_complex64', rand_x( + 5, np.float64, complex=True), None, -1, + 'backward'), ('test_n_grater_than_input_length', rand_x( + 5, max_dim_len=5), 11, -1, + 'backward'), ('test_n_smaller_than_input_length', rand_x( + 5, min_dim_len=5), 3, -1, 'backward'), + ('test_axis_not_last', rand_x(5), None, 3, 'backward'), + ('test_norm_forward', rand_x(5), None, 3, 'forward'), + ('test_norm_ortho', rand_x(5), None, 3, 'ortho')]) +class TestFft(unittest.TestCase): + def test_static_rfft(self): + with stgraph(paddle.fft.fft, self.place, self.x, self.n, self.axis, + self.norm) as y: + np.testing.assert_allclose( + scipy.fft.fft(self.x, self.n, self.axis, self.norm), + y, + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype))) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [('test_n_nagative', rand_x(2), -1, -1, 'backward', ValueError), + ('test_n_zero', rand_x(2), 0, -1, 'backward', ValueError), + ('test_axis_out_of_range', rand_x(1), None, 10, 'backward', + ValueError), ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', + ValueError), ('test_norm_not_in_enum_value', rand_x(2), + None, -1, 'random', ValueError)]) +class TestFftException(unittest.TestCase): + def test_fft(self): + with self.assertRaises(self.expect_exception): + with stgraph(paddle.fft.fft, self.place, self.x, self.n, self.axis, + self.norm) as y: + pass + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [ + ('test_x_float64', rand_x(5), None, (0, 1), 'backward'), + ('test_x_complex128', rand_x( + 5, complex=True), None, (0, 1), 'backward'), + ('test_n_grater_input_length', rand_x( + 5, max_dim_len=5), (6, 6), (0, 1), 'backward'), + ('test_n_smaller_than_input_length', rand_x( + 5, min_dim_len=5), (4, 4), (0, 1), 'backward'), + ('test_axis_random', rand_x(5), None, (1, 2), 'backward'), + ('test_axis_none', rand_x(5), None, None, 'backward'), + ('test_norm_forward', rand_x(5), None, (0, 1), 'forward'), + ('test_norm_ortho', rand_x(5), None, (0, 1), 'ortho'), + ]) +class TestFft2(unittest.TestCase): + def test_static_fft2(self): + with stgraph(paddle.fft.fft2, self.place, self.x, self.n, self.axis, + self.norm) as y: + np.testing.assert_allclose( + scipy.fft.fft2(self.x, self.n, self.axis, self.norm), + y, + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype))) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [ + # ('test_x_not_tensor', [0, 1], None, (0, 1), 'backward', ValueError), + ('test_x_1dim_tensor', rand_x(1), None, (0, 1), 'backward', ValueError), + ('test_n_nagative', rand_x(2), -1, (0, 1), 
'backward', ValueError), + ('test_n_zero', rand_x(2), 0, (0, 1), 'backward', ValueError), + ('test_axis_out_of_range', rand_x(2), None, (0, 1, 2), 'backward', + ValueError), + ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', + ValueError), + ('test_axis_not_sequence', rand_x(5), None, -10, 'backward', + ValueError), + ('test_norm_not_enum', rand_x(2), None, -1, 'random', ValueError) + ]) +class TestFft2Exception(unittest.TestCase): + def test_static_fft2(self): + with self.assertRaises(self.expect_exception): + with stgraph(paddle.fft.fft2, self.place, self.x, self.n, self.axis, + self.norm) as y: + pass + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), + [('test_x_float64', rand_x(5, np.float64), None, None, 'backward'), + ('test_x_complex128', rand_x( + 5, np.float64, complex=True), None, None, + 'backward'), ('test_n_grater_input_length', rand_x( + 5, max_dim_len=5), (6, 6), (1, 2), + 'backward'), ('test_n_smaller_input_length', rand_x( + 5, min_dim_len=5), (3, 3), (1, 2), 'backward'), + ('test_axis_not_default', rand_x(5), None, (1, 2), + 'backward'), ('test_norm_forward', rand_x(5), None, None, 'forward'), + ('test_norm_ortho', rand_x(5), None, None, 'ortho')]) +class TestFftn(unittest.TestCase): + def test_static_fftn(self): + with stgraph(paddle.fft.fftn, self.place, self.x, self.n, self.axis, + self.norm) as y: + np.testing.assert_allclose( + scipy.fft.fftn(self.x, self.n, self.axis, self.norm), + y, + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype))) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [('test_x_complex', rand_x( + 4, complex=True), None, None, 'backward', + TypeError), ('test_n_nagative', rand_x(4), (-1, -1), (1, 2), 'backward', + ValueError), ('test_n_not_sequence', rand_x(4), -1, None, + 'backward', ValueError), + ('test_n_zero', rand_x(4), 0, None, 'backward', ValueError), + ('test_axis_out_of_range', rand_x(1), None, [0, 1], 'backward', + ValueError), ('test_norm_not_in_enum', rand_x(2), None, -1, 'random', + ValueError)]) +class TestRfftnException(unittest.TestCase): + def test_static_rfftn(self): + with self.assertRaises(self.expect_exception): + with stgraph(paddle.fft.rfftn, self.place, self.x, self.n, + self.axis, self.norm) as y: + pass + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [ + ('test_x_complex128', + (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) + ).astype(np.complex128), None, -1, "backward"), + ('test_n_grater_than_input_length', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), 4, -1, + "backward"), + ('test_n_smaller_than_input_length', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), 2, -1, + "backward"), + ('test_axis_not_last', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, 1, + "backward"), + ('test_norm_forward', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, 1, + "forward"), + ('test_norm_ortho', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, -1, + "ortho"), +]) +class TestHfft(unittest.TestCase): + """Test hfft with norm condition + """ + + def test_hfft(self): + with stgraph(paddle.fft.hfft, self.place, self.x, self.n, self.axis, + self.norm) as y: + np.testing.assert_allclose( + scipy.fft.hfft(self.x, self.n, self.axis, self.norm), + y, + rtol=1e-5, + atol=0) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [ + ('test_x_complex128', + (np.random.randn(4, 4, 4) + 
1j * np.random.randn(4, 4, 4) + ).astype(np.complex128), None, -1, "backward"), + ('test_n_grater_than_input_length', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), 4, -1, + "backward"), + ('test_n_smaller_than_input_length', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), 2, -1, + "backward"), + ('test_axis_not_last', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, -1, + "backward"), + ('test_norm_forward', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, -1, + "forward"), + ('test_norm_ortho', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, -1, + "ortho"), +]) +class TestIrfft(unittest.TestCase): + """Test irfft with norm condition + """ + + def test_irfft(self): + with stgraph(paddle.fft.irfft, self.place, self.x, self.n, self.axis, + self.norm) as y: + np.testing.assert_allclose( + scipy.fft.irfft(self.x, self.n, self.axis, self.norm), + y, + rtol=1e-5, + atol=0) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [ + ('test_x_complex128', + (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) + ).astype(np.complex128), None, None, "backward"), + ('test_n_grater_than_input_length', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [4], None, + "backward"), + ('test_n_smaller_than_input_length', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [2], None, + "backward"), + ('test_axis_not_last', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None, + "backward"), + ('test_norm_forward', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None, + "forward"), + ('test_norm_ortho', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None, + "ortho"), +]) +class Testirfftn(unittest.TestCase): + """Test irfftn with norm condition + """ + + def test_static_irfftn(self): + with stgraph(paddle.fft.irfftn, self.place, self.x, self.n, self.axis, + self.norm) as y: + np.testing.assert_allclose( + scipy.fft.irfftn(self.x, self.n, self.axis, self.norm), + y, + rtol=1e-5, + atol=0) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [ + ('test_x_complex128', + (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) + ).astype(np.complex128), None, None, "backward"), + ('test_n_grater_than_input_length', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [4], None, + "backward"), + ('test_n_smaller_than_input_length', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [2], None, + "backward"), + ('test_axis_not_last', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None, + "backward"), + ('test_norm_forward', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None, + "forward"), + ('test_norm_ortho', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None, + "ortho"), +]) +class Testhfftn(unittest.TestCase): + """Test hfftn with norm condition + """ + + def test_static_hfftn(self): + with stgraph(paddle.fft.hfftn, self.place, self.x, self.n, self.axis, + self.norm) as y: + np.testing.assert_allclose( + scipy.fft.hfftn(self.x, self.n, self.axis, self.norm), + y, + rtol=1e-5, + atol=0) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'x', 's', 'axis', 'norm'), [ + ('test_x_complex128', + (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) + ).astype(np.complex128), None, (-2, -1), "backward"), + ('test_n_grater_input_length', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [4, 8], (-2, -1), + "backward"), + 
('test_n_smaller_input_length', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [2, 4], (-2, -1), + "backward"), + ('test_axis_not_last', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1), + "backward"), + ('test_norm_forward', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1), + "forward"), + ('test_norm_ortho', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1), + "ortho"), +]) +class Testhfft2(unittest.TestCase): + """Test hfft2 with norm condition + """ + + def test_static_hfft2(self): + with stgraph(paddle.fft.hfft2, self.place, self.x, self.s, self.axis, + self.norm) as y: + np.testing.assert_allclose( + scipy.fft.hfft2(self.x, self.s, self.axis, self.norm), + y, + rtol=1e-5, + atol=0) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'x', 's', 'axis', 'norm'), [ + ('test_x_complex128', + (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) + ).astype(np.complex128), None, (-2, -1), "backward"), + ('test_n_equal_input_length', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (2, 4), (-2, -1), + "backward"), + ('test_axis_not_last', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1), + "backward"), + ('test_norm_forward', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1), + "forward"), + ('test_norm_ortho', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1), + "ortho"), +]) +class TestIrfft2(unittest.TestCase): + """Test irfft2 with norm condition + """ + + def test_static_irfft2(self): + with stgraph(paddle.fft.irfft2, self.place, self.x, self.s, self.axis, + self.norm) as y: + np.testing.assert_allclose( + scipy.fft.irfft2(self.x, self.s, self.axis, self.norm), + y, + rtol=1e-5, + atol=0) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [('test_input_dtype', np.random.randn(4, 4, 4), None, -1, 'backward', + TypeError), ('test_bool_input', + (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) + ).astype(np.bool8), None, -1, 'backward', TypeError), + ('test_n_nagative', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), -1, -1, + 'backward', ValueError), + ('test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4), 0, -1, + 'backward', ValueError), + ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + (1, 2, 3), -1, 'backward', ValueError), + ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4), + None, 10, 'backward', ValueError), ( + 'test_axis_with_array', np.random.randn(4) + 1j * np.random.randn(4), + None, (0, 1), 'backward', + ValueError), ('test_norm_not_in_enum_value', + np.random.randn(4, 4) + 1j * np.random.randn(4, 4), + None, -1, 'random', ValueError)]) +class TestHfftException(unittest.TestCase): + '''Test hfft with buoudary condition + Test case include: + - non complex input + - n out of range + - axis out of range + - norm out of range + ''' + + def test_static_hfft(self): + with self.assertRaises(self.expect_exception): + with stgraph(paddle.fft.hfft, self.place, self.x, self.n, self.axis, + self.norm) as y: + pass + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [('test_input_dtype', np.random.randn(4, 4, 4), None, -1, 'backward', + TypeError), ('test_bool_input', + (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) + ).astype(np.bool8), None, -1, 'backward', TypeError), + ('test_n_nagative', + 
np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), -1, -1, + 'backward', ValueError), + ('test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4), 0, -1, + 'backward', ValueError), + ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + (1, 2), -1, 'backward', ValueError), + ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4), + None, 10, 'backward', ValueError), ( + 'test_axis_with_array', np.random.randn(4) + 1j * np.random.randn(4), + None, (0, 1), 'backward', + ValueError), ('test_norm_not_in_enum_value', + np.random.randn(4, 4) + 1j * np.random.randn(4, 4), + None, None, 'random', ValueError)]) +class TestIrfftException(unittest.TestCase): + '''Test Irfft with buoudary condition + Test case include: + - non complex input + - n out of range + - axis out of range + - norm out of range + - the dimensions of n and axis are different + ''' + + def test_static_irfft(self): + with self.assertRaises(self.expect_exception): + with stgraph(paddle.fft.irfft, self.place, self.x, self.n, + self.axis, self.norm) as y: + pass + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [('test_input_dtype', np.random.randn(4, 4, 4), None, None, 'backward', + TypeError), ('test_bool_input', + (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) + ).astype(np.bool8), None, (-2, -1), 'backward', TypeError), + ('test_n_nagative', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), + (-2, -1), 'backward', ValueError), + ('test_n_zero', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + (0, 0), (-2, -1), 'backward', ValueError), + ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + 3, None, 'backward', + ValueError), ('test_n_axis_dim', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + (1, 2), (-1), 'backward', ValueError), + ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4), + None, (1, 2), 'backward', ValueError), ( + 'test_axis_type', np.random.randn(4) + 1j * np.random.randn(4), None, + -1, 'backward', + ValueError), ('test_norm_not_in_enum_value', + np.random.randn(4, 4) + 1j * np.random.randn(4, 4), + None, None, 'random', ValueError)]) +class TestHfft2Exception(unittest.TestCase): + '''Test hfft2 with buoudary condition + Test case include: + - non complex input + - n out of range + - axis out of range + - the dimensions of n and axis are different + - norm out of range + ''' + + def test_static_hfft2(self): + with self.assertRaises(self.expect_exception): + with stgraph(paddle.fft.hfft2, self.place, self.x, self.n, + self.axis, self.norm) as y: + pass + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [('test_input_dtype', np.random.randn(4, 4, 4), None, None, 'backward', + TypeError), ('test_bool_input', + (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) + ).astype(np.bool8), None, (-2, -1), 'backward', TypeError), + ('test_n_nagative', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), + (-2, -1), 'backward', ValueError), + ('test_n_zero', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + (0, 0), (-2, -1), 'backward', ValueError), + ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + 3, -1, 'backward', + ValueError), ('test_n_axis_dim', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + (1, 2), (-3, -2, -1), 'backward', ValueError), + ('test_axis_out_of_range', np.random.randn(4) + 
1j * np.random.randn(4), + None, (1, 2), 'backward', ValueError), ( + 'test_axis_type', np.random.randn(4) + 1j * np.random.randn(4), None, + 1, 'backward', + ValueError), ('test_norm_not_in_enum_value', + np.random.randn(4, 4) + 1j * np.random.randn(4, 4), + None, None, 'random', ValueError)]) +class TestIrfft2Exception(unittest.TestCase): + '''Test irfft2 with buoudary condition + Test case include: + - non complex input + - n out of range + - axis out of range + - norm out of range + - the dimensions of n and axis are different + ''' + + def test_static_irfft2(self): + with self.assertRaises(self.expect_exception): + with stgraph(paddle.fft.irfft2, self.place, self.x, self.n, + self.axis, self.norm) as y: + pass + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [('test_input_dtype', np.random.randn(4, 4, 4), None, None, 'backward', + TypeError), ('test_bool_input', + (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) + ).astype(np.bool8), None, (-2, -1), 'backward', TypeError), + ('test_n_nagative', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), + (-2, -1), 'backward', ValueError), + ('test_n_zero', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + (0, 0), (-2, -1), 'backward', ValueError), + ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + 3, -1, 'backward', + ValueError), ('test_n_axis_dim', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), + (1, 2), (-3, -2, -1), 'backward', ValueError), + ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4), + None, (10, 20), 'backward', ValueError), ( + 'test_axis_type', np.random.randn(4) + 1j * np.random.randn(4), None, + 1, 'backward', + ValueError), ('test_norm_not_in_enum_value', + np.random.randn(4, 4) + 1j * np.random.randn(4, 4), + None, None, 'random', ValueError)]) +class TestHfftnException(unittest.TestCase): + '''Test hfftn with buoudary condition + Test case include: + - non complex input + - n out of range + - axis out of range + - norm out of range + - the dimensions of n and axis are different + ''' + + def test_static_hfftn(self): + with self.assertRaises(self.expect_exception): + with stgraph(paddle.fft.hfftn, self.place, self.x, self.n, + self.axis, self.norm) as y: + pass + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [ + ('test_input_dtype', np.random.randn(4, 4, 4), None, None, 'backward', + TypeError), + # ('test_bool_input', + # (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) + # ).astype(np.bool8), None, (-2, -1), 'backward', ValueError), + ('test_n_nagative', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), + (-2, -1), 'backward', ValueError), + ('test_n_zero', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (0, 0), + (-2, -1), 'backward', ValueError), + ('test_n_type', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), 3, -1, + 'backward', ValueError), + ('test_n_axis_dim', + np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (1, 2), + (-3, -2, -1), 'backward', ValueError), + ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4), + None, (10, 20), 'backward', ValueError), + ('test_axis_type', np.random.randn(4) + 1j * np.random.randn(4), None, + 1, 'backward', ValueError), + ('test_norm_not_in_enum_value', + np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None, None, + 'random', ValueError) + ]) +class 
TestIrfftnException(unittest.TestCase): + '''Test irfftn with buoudary condition + Test case include: + - non complex input + - n out of range + - axis out of range + - norm out of range + - the dimensions of n and axis are different + ''' + + def test_static_irfftn(self): + with self.assertRaises(self.expect_exception): + with stgraph(paddle.fft.irfftn, self.place, self.x, self.n, + self.axis, self.norm) as y: + pass + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), + [('test_x_float64', rand_x(5, np.float64), None, -1, 'backward'), ( + 'test_n_grater_than_input_length', rand_x( + 5, max_dim_len=5), 11, -1, 'backward'), + ('test_n_smaller_than_input_length', rand_x( + 5, min_dim_len=5), 3, -1, + 'backward'), ('test_axis_not_last', rand_x(5), None, 3, 'backward'), + ('test_norm_forward', rand_x(5), None, 3, 'forward'), + ('test_norm_ortho', rand_x(5), None, 3, 'ortho')]) +class TestRfft(unittest.TestCase): + def test_static_rfft(self): + with stgraph(paddle.fft.rfft, self.place, self.x, self.n, self.axis, + self.norm) as y: + np.testing.assert_allclose( + scipy.fft.rfft(self.x, self.n, self.axis, self.norm), + y, + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype))) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [('test_n_nagative', rand_x(2), -1, -1, 'backward', ValueError), + ('test_n_zero', rand_x(2), 0, -1, 'backward', ValueError), + ('test_axis_out_of_range', rand_x(1), None, 10, 'backward', + ValueError), ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', + ValueError), ('test_norm_not_in_enum_value', rand_x(2), + None, -1, 'random', ValueError)]) +class TestRfftException(unittest.TestCase): + def test_rfft(self): + with self.assertRaises(self.expect_exception): + with stgraph(paddle.fft.rfft, self.place, self.x, self.n, self.axis, + self.norm) as y: + pass + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [ + ('test_x_float64', rand_x(5), None, (0, 1), 'backward'), + ('test_n_grater_input_length', rand_x( + 5, max_dim_len=5), (6, 6), (0, 1), 'backward'), + ('test_n_smaller_than_input_length', rand_x( + 5, min_dim_len=5), (4, 4), (0, 1), 'backward'), + ('test_axis_random', rand_x(5), None, (1, 2), 'backward'), + ('test_axis_none', rand_x(5), None, None, 'backward'), + ('test_norm_forward', rand_x(5), None, (0, 1), 'forward'), + ('test_norm_ortho', rand_x(5), None, (0, 1), 'ortho'), + ]) +class TestRfft2(unittest.TestCase): + def test_static_rfft2(self): + with stgraph(paddle.fft.rfft2, self.place, self.x, self.n, self.axis, + self.norm) as y: + np.testing.assert_allclose( + scipy.fft.rfft2(self.x, self.n, self.axis, self.norm), + y, + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype))) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [ + ('test_x_complex_input', rand_x( + 2, complex=True), None, (0, 1), 'backward', TypeError), + # ('test_x_not_tensor', [0, 1], None, (0, 1), 'backward', ValueError), + ('test_x_1dim_tensor', rand_x(1), None, (0, 1), 'backward', ValueError), + ('test_n_nagative', rand_x(2), -1, (0, 1), 'backward', ValueError), + ('test_n_zero', rand_x(2), 0, (0, 1), 'backward', ValueError), + ('test_axis_out_of_range', rand_x(2), None, (0, 1, 2), 'backward', + ValueError), + ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', + ValueError), + ('test_axis_not_sequence', rand_x(5), None, -10, 'backward', + ValueError), + 
('test_norm_not_enum', rand_x(2), None, -1, 'random', ValueError) + ]) +class TestRfft2Exception(unittest.TestCase): + def test_static_rfft(self): + with self.assertRaises(self.expect_exception): + with stgraph(paddle.fft.rfft2, self.place, self.x, self.n, + self.axis, self.norm) as y: + pass + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), + [('test_x_float64', rand_x(5, np.float64), None, None, 'backward'), + ('test_n_grater_input_length', rand_x( + 5, max_dim_len=5), (6, 6), (1, 2), + 'backward'), ('test_n_smaller_input_length', rand_x( + 5, min_dim_len=5), (3, 3), (1, 2), 'backward'), + ('test_axis_not_default', rand_x(5), None, (1, 2), + 'backward'), ('test_norm_forward', rand_x(5), None, None, 'forward'), + ('test_norm_ortho', rand_x(5), None, None, 'ortho')]) +class TestRfftn(unittest.TestCase): + def test_static_rfft(self): + with stgraph(paddle.fft.rfftn, self.place, self.x, self.n, self.axis, + self.norm) as y: + np.testing.assert_allclose( + scipy.fft.rfftn(self.x, self.n, self.axis, self.norm), + y, + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype))) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [('test_x_complex', rand_x( + 4, complex=True), None, None, 'backward', + TypeError), ('test_n_nagative', rand_x(4), (-1, -1), (1, 2), 'backward', + ValueError), ('test_n_not_sequence', rand_x(4), -1, None, + 'backward', ValueError), + ('test_n_zero', rand_x(4), 0, None, 'backward', ValueError), + ('test_axis_out_of_range', rand_x(1), None, [0, 1], 'backward', + ValueError), ('test_norm_not_in_enum', rand_x(2), None, -1, 'random', + ValueError)]) +class TestRfftnException(unittest.TestCase): + def test_static_rfftn(self): + with self.assertRaises(self.expect_exception): + with stgraph(paddle.fft.rfftn, self.place, self.x, self.n, + self.axis, self.norm) as y: + pass + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), + [('test_x_float64', rand_x(5, np.float64), None, -1, 'backward'), ( + 'test_n_grater_than_input_length', rand_x( + 5, max_dim_len=5), 11, -1, 'backward'), + ('test_n_smaller_than_input_length', rand_x( + 5, min_dim_len=5), 3, -1, + 'backward'), ('test_axis_not_last', rand_x(5), None, 3, 'backward'), + ('test_norm_forward', rand_x(5), None, 3, 'forward'), + ('test_norm_ortho', rand_x(5), None, 3, 'ortho')]) +class TestIhfft(unittest.TestCase): + def test_static_ihfft(self): + with stgraph(paddle.fft.ihfft, self.place, self.x, self.n, self.axis, + self.norm) as y: + np.testing.assert_allclose( + scipy.fft.ihfft(self.x, self.n, self.axis, self.norm), + y, + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype))) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [ + ('test_n_nagative', rand_x(2), -1, -1, 'backward', ValueError), + ('test_n_zero', rand_x(2), 0, -1, 'backward', ValueError), + ('test_axis_out_of_range', rand_x(1), None, 10, 'backward', ValueError), + ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', ValueError), + ('test_norm_not_in_enum_value', rand_x(2), None, -1, 'random', ValueError) +]) +class TestIhfftException(unittest.TestCase): + def test_static_ihfft(self): + with self.assertRaises(self.expect_exception): + with stgraph(paddle.fft.ihfft, self.place, self.x, self.n, + self.axis, self.norm) as y: + pass + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [ + ('test_x_float64', rand_x(5), None, (0, 1), 
'backward'), + ('test_n_grater_input_length', rand_x( + 5, max_dim_len=5), (11, 11), (0, 1), 'backward'), + ('test_n_smaller_than_input_length', rand_x( + 5, min_dim_len=5), (1, 1), (0, 1), 'backward'), + ('test_axis_random', rand_x(5), None, (1, 2), 'backward'), + ('test_axis_none', rand_x(5), None, None, 'backward'), + ('test_norm_forward', rand_x(5), None, (0, 1), 'forward'), + ('test_norm_ortho', rand_x(5), None, (0, 1), 'ortho'), + ]) +class TestIhfft2(unittest.TestCase): + def test_static_ihfft2(self): + with stgraph(paddle.fft.ihfft2, self.place, self.x, self.n, self.axis, + self.norm) as y: + np.testing.assert_allclose( + scipy.fft.ihfft2(self.x, self.n, self.axis, self.norm), + y, + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype))) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [ + ('test_x_complex_input', rand_x( + 2, complex=True), None, (0, 1), None, ValueError), + # ('test_x_not_tensor', [0, 1], None, (0, 1), None, ValueError), + ('test_x_1dim_tensor', rand_x(1), None, (0, 1), None, ValueError), + ('test_n_nagative', rand_x(2), -1, (0, 1), 'backward', ValueError), + ('test_n_len_not_equal_axis', rand_x( + 5, max_dim_len=5), 11, (0, 1), 'backward', ValueError), + ('test_n_zero', rand_x(2), (0, 0), (0, 1), 'backward', ValueError), + ('test_axis_out_of_range', rand_x(2), None, (0, 1, 2), 'backward', + ValueError), + ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', + ValueError), + ('test_axis_not_sequence', rand_x(5), None, -10, 'backward', + ValueError), + ('test_norm_not_enum', rand_x(2), None, -1, 'random', ValueError) + ]) +class TestIhfft2Exception(unittest.TestCase): + def test_static_ihfft2(self): + with self.assertRaises(self.expect_exception): + with stgraph(paddle.fft.ihfft2, self.place, self.x, self.n, + self.axis, self.norm) as y: + pass + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), + [('test_x_float64', rand_x(5, np.float64), None, None, 'backward'), + ('test_n_grater_input_length', rand_x( + 5, max_dim_len=5), (11, 11), (0, 1), + 'backward'), ('test_n_smaller_input_length', rand_x( + 5, min_dim_len=5), (1, 1), (0, 1), 'backward'), + ('test_axis_not_default', rand_x(5), None, (1, 2), + 'backward'), ('test_norm_forward', rand_x(5), None, None, 'forward'), + ('test_norm_ortho', rand_x(5), None, None, 'ortho')]) +class TestIhfftn(unittest.TestCase): + def test_static_ihfftn(self): + with stgraph(paddle.fft.ihfftn, self.place, self.x, self.n, self.axis, + self.norm) as y: + np.testing.assert_allclose( + scipy.fft.ihfftn(self.x, self.n, self.axis, self.norm), + y, + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype))) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), + [('test_x_complex', rand_x( + 4, complex=True), None, None, 'backward', TypeError), + ('test_n_nagative', rand_x(4), -1, None, 'backward', + ValueError), ('test_n_zero', rand_x(4), 0, None, 'backward', ValueError), + ('test_axis_out_of_range', rand_x(1), None, [0, 1], 'backward', + ValueError), ('test_norm_not_in_enum', rand_x(2), None, -1, 'random', + ValueError)]) +class TestIhfftnException(unittest.TestCase): + def test_static_ihfftn(self): + with self.assertRaises(self.expect_exception): + with stgraph(paddle.fft.ihfftn, self.place, self.x, self.n, + self.axis, self.norm) as y: + pass + + +if __name__ == '__main__': + unittest.main() + +# yapf: enable diff --git 
a/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py b/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py new file mode 100644 index 0000000000000..a84092e36f6a8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py @@ -0,0 +1,178 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest + +import numpy as np +import paddle + +import re +import sys +from spectral_op_np import fft_c2c, fft_r2c, fft_c2r +import paddle.fluid.core as core +import paddle.fluid.dygraph as dg +import paddle.static as static +from numpy.random import random as rand +from paddle.fluid import Program, program_guard +sys.path.append("../") +from op_test import OpTest + +paddle.enable_static() + +TEST_CASE_NAME = 'test_case' + + +def parameterize(attrs, input_values=None): + + if isinstance(attrs, str): + attrs = [attrs] + input_dicts = (attrs if input_values is None else + [dict(zip(attrs, vals)) for vals in input_values]) + + def decorator(base_class): + test_class_module = sys.modules[base_class.__module__].__dict__ + for idx, input_dict in enumerate(input_dicts): + test_class_dict = dict(base_class.__dict__) + test_class_dict.update(input_dict) + + name = class_name(base_class, idx, input_dict) + + test_class_module[name] = type(name, (base_class, ), + test_class_dict) + + for method_name in list(base_class.__dict__): + if method_name.startswith("test"): + delattr(base_class, method_name) + return base_class + + return decorator + + +def to_safe_name(s): + return str(re.sub("[^a-zA-Z0-9_]+", "_", s)) + + +def class_name(cls, num, params_dict): + suffix = to_safe_name( + next((v for v in params_dict.values() if isinstance(v, str)), "")) + if TEST_CASE_NAME in params_dict: + suffix = to_safe_name(params_dict["test_case"]) + return "{}_{}{}".format(cls.__name__, num, suffix and "_" + suffix) + + +@parameterize((TEST_CASE_NAME, 'x', 'axes', 'norm', 'forward'), [ + ('test_axes_is_sqe_type', (np.random.random( + (12, 14)) + 1j * np.random.random((12, 14))).astype(np.complex128), + [0, 1], 'forward', True), ('test_axis_not_last', (np.random.random( + (4, 4, 4)) + 1j * np.random.random((4, 4, 4))).astype(np.complex128), + (0, 1), "backward", False), + ('test_norm_forward', (np.random.random((12, 14)) + 1j * np.random.random( + (12, 14))).astype(np.complex128), (0, ), "forward", + False), ('test_norm_backward', (np.random.random( + (12, 14)) + 1j * np.random.random((12, 14))).astype(np.complex128), + (0, ), "backward", True), ('test_norm_ortho', (np.random.random( + (12, 14)) + 1j * np.random.random( + (12, 14))).astype(np.complex128), (1, ), "ortho", True) +]) +class TestFFTC2COp(OpTest): + # Because framwork not support complex numerial gradient, we skip gradient check. 
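+    # Only the forward output is checked: the expected result is computed with fft_c2c from spectral_op_np.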
+ no_need_check_grad = True + + def setUp(self): + self.op_type = "fft_c2c" + + out = fft_c2c(self.x, self.axes, self.norm, self.forward) + + self.inputs = {'X': self.x} + self.attrs = { + 'axes': self.axes, + 'normalization': self.norm, + "forward": self.forward + } + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + +@parameterize( + (TEST_CASE_NAME, 'x', 'axes', 'norm', 'forward', 'last_dim_size'), + [('test_axes_is_sqe_type', (np.random.random( + (12, 14)) + 1j * np.random.random((12, 14))).astype(np.complex128), + [0, 1], 'forward', True, 26), ('test_axis_not_last', (np.random.random( + (4, 4, 4)) + 1j * np.random.random((4, 4, 4))).astype(np.complex128), + (0, 1), "backward", False, None), + ('test_norm_forward', (np.random.random((12, 14)) + 1j * np.random.random( + (12, 14))).astype(np.complex128), (0, ), "forward", False, 22), + ('test_norm_backward', (np.random.random((12, 14)) + 1j * np.random.random( + (12, 14))).astype(np.complex128), (0, ), "backward", True, + 22), ('test_norm_ortho', (np.random.random( + (12, 14)) + 1j * np.random.random((12, 14))).astype(np.complex128), + (1, ), "ortho", True, 26)]) +class TestFFTC2ROp(OpTest): + # Because framwork not support complex numerial gradient, we skip gradient check. + no_need_check_grad = True + + def setUp(self): + self.op_type = "fft_c2r" + + out = fft_c2r(self.x, self.axes, self.norm, self.forward, + self.last_dim_size) + + self.inputs = {'X': self.x} + self.attrs = { + "axes": self.axes, + "normalization": self.norm, + "forward": self.forward, + "last_dim_size": self.last_dim_size + } + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + +@parameterize( + (TEST_CASE_NAME, 'x', 'axes', 'norm', 'forward', 'onesided'), + [('test_axes_is_sqe_type', np.random.randn(12, 14).astype(np.float64), + (0, 1), 'forward', True, + True), ('test_axis_not_last', np.random.randn(4, 4, 4).astype(np.float64), + (0, 1), "backward", False, True), + ('test_norm_forward', np.random.randn(12, 14).astype(np.float64), (0, 1), + "forward", False, False), + ('test_norm_backward', np.random.randn(12, 14).astype(np.float64), (0, ), + "backward", True, False), ('test_norm_ortho', + np.random.randn(12, 14).astype(np.float64), + (1, ), "ortho", True, False)]) +class TestFFTR2COp(OpTest): + # Because framwork not support complex numerial gradient, we skip gradient check. 
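+    # Only the forward output is checked: the expected result is computed with fft_r2c from spectral_op_np.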
+ no_need_check_grad = True + + def setUp(self): + self.op_type = "fft_r2c" + + out = fft_r2c(self.x, self.axes, self.norm, self.forward, self.onesided) + + self.inputs = {'X': self.x} + self.attrs = { + 'axes': self.axes, + 'normalization': self.norm, + "forward": self.forward, + 'onesided': self.onesided + } + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py index 083ad319305f3..4c966585d5f1f 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py @@ -48,6 +48,7 @@ def train_batch(self, batch, model, optimizer, is_mp): scaled.backward() # do backward scaler.step(optimizer) # update parameters + scaler.update() optimizer.clear_grad() return scaled diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py index da335a88e3030..f269979746a08 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py @@ -79,18 +79,11 @@ def check_cost_info(self, cost_info): IS_WINDOWS = sys.platform.startswith('win') if core.is_compiled_with_cuda(): - # input `a` is on CPU, 16 bytes - self.assertEqual(cost_info.host_memory_bytes(), 16) # # w,bias,b, out, memory block is at least 256 bytes on Linux gt = 16 * 4 if IS_WINDOWS else 256 * 4 self.assertGreater(cost_info.device_memory_bytes(), gt) - self.assertGreaterEqual(cost_info.device_total_memory_bytes(), - cost_info.device_memory_bytes()) else: - # x(16 bytes), w(16 bytes), bias(8 bytes), b(16 bytes), out(16 bytes) - self.assertGreaterEqual(cost_info.host_memory_bytes(), 72) self.assertEqual(cost_info.device_memory_bytes(), 0) - self.assertGreaterEqual(cost_info.device_total_memory_bytes(), 0) def build_program(): @@ -249,9 +242,6 @@ def test_with_feed(self): def test_with_error(self): feed = [{'a': np.ones([2, 2], dtype="float32")}] - with self.assertRaises(TypeError): - res = self.run_new_executor(feed) - with self.assertRaises(TypeError): os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' self._run(feed[0], add_wrong_fetch=True) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py new file mode 100644 index 0000000000000..bf457a9da40a8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py @@ -0,0 +1,116 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
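+
+import unittest  # required by the unittest.main() call at the bottom of this file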
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertAnchorGeneratorTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input1(batch, attrs: List[Dict[str, Any]]): + return np.random.random([batch, 3, 64, 64]).astype(np.float32) + + for batch in [1, 2, 4]: + for anchor_sizes in [[64.0, 128.0, 256.0, 512.0]]: + for aspect_ratios in [[0.5, 1, 2], [0.4, 1.2, 3]]: + for variances in [[1.0, 1.0, 1.0, 1.0], + [0.5, 1.0, 0.5, 1.0]]: + for stride in [[16.0, 16.0], [16.0, 32.0]]: + for offset in [0.5, 0.8]: + dics = [{ + "anchor_sizes": anchor_sizes, + "aspect_ratios": aspect_ratios, + "variances": variances, + "stride": stride, + "offset": offset + }] + + ops_config = [{ + "op_type": "anchor_generator", + "op_inputs": { + "Input": ["input_data"] + }, + "op_outputs": { + "Anchors": ["output_anchors"], + "Variances": ["output_variances"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input1, + batch, dics)) + }, + outputs=[ + "output_anchors", "output_variances" + ]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]} + self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]} + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 3 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py index db99d1dd3504d..fd4b5ad9a72b6 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py @@ -22,7 +22,6 @@ class TrtConvertConv2dTest(TrtLayerAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: - # TODO: This is just the example to remove the wrong attrs. 
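+        # Only keep configurations where input channels == filter channels * groups; anything else is not a valid conv2d program.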
inputs = program_config.inputs weights = program_config.weights attrs = [ @@ -30,113 +29,136 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: for i in range(len(program_config.ops)) ] - # groups restriction. if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[ 1] * attrs[0]['groups']: return False - # others restriction, todo. - return True def sample_program_configs(self): - def generate_input1(attrs: List[Dict[str, Any]]): - # TODO: This is just the example to illustrate the releation between axis and input. - # for each attr, can generate different datas + self.trt_param.workspace_size = 1073741824 + + def generate_input1(batch, attrs: List[Dict[str, Any]]): if attrs[0]['groups'] == 1: - return np.ones([2, 3, 64, 64]).astype(np.float32) + return np.ones([batch, 3, 64, 64]).astype(np.float32) + elif attrs[0]['groups'] == 2: + return np.ones([batch, 6, 64, 64]).astype(np.float32) else: - return np.ones([1, 3, 64, 64]).astype(np.float32) + return np.ones([batch, 9, 64, 64]).astype(np.float32) def generate_weight1(attrs: List[Dict[str, Any]]): return np.random.random([24, 3, 3, 3]).astype(np.float32) - # for strides in [[1, 1], [2, 2], [1, 2], [2, 3]]: - # for paddings in [[0, 3], [3, 1], [1, 1, 1, 1]]: - # for groups in [1, 2]: - # for padding_algotithm in ['EXPLICIT', 'SAME', 'VALID']: - # for dilations in [[1, 1], [1, 2]]: - # for data_format in ['NCHW']: - for strides in [[1, 1], [2, 2]]: - for paddings in [[0, 3], [3, 1]]: - for groups in [1]: - for padding_algotithm in ['EXPLICIT']: - for dilations in [[1, 1]]: - for data_format in ['NCHW']: - - dics = [{ - "data_fromat": data_format, - "dilations": dilations, - "padding_algorithm": padding_algotithm, - "groups": groups, - "paddings": paddings, - "strides": strides, - "data_format": data_format - }, {}] - - ops_config = [{ - "op_type": "conv2d", - "op_inputs": { - "Input": ["input_data"], - "Filter": ["conv2d_weight"] - }, - "op_outputs": { - "Output": ["conv_output_data"] - }, - "op_attrs": dics[0] - }, { - "op_type": "relu", - "op_inputs": { - "X": ["conv_output_data"] - }, - "op_outputs": { - "Out": ["relu_output_data"] - }, - "op_attrs": dics[1] - }] - ops = self.generate_op_config(ops_config) - - program_config = ProgramConfig( - ops=ops, - weights={ - "conv2d_weight": TensorConfig( - data_gen=partial(generate_weight1, - dics)) - }, - inputs={ - "input_data": TensorConfig( - data_gen=partial(generate_input1, - dics)) - }, - outputs=["relu_output_data"]) - - yield program_config + for batch in [1, 2, 4]: + for strides in [[1, 1], [2, 2], [1, 2]]: + for paddings in [[0, 3], [1, 2, 3, 4]]: + for groups in [1, 2, 3]: + for padding_algorithm in ['EXPLICIT', 'SAME', 'VALID']: + for dilations in [[1, 1], [2, 2], [1, 2]]: + for data_format in ['NCHW']: + + dics = [{ + "data_fromat": data_format, + "dilations": dilations, + "padding_algorithm": padding_algorithm, + "groups": groups, + "paddings": paddings, + "strides": strides, + "data_format": data_format + }, {}] + + if padding_algorithm == 'EXPLICIT': + ops_config = [{ + "op_type": "conv2d", + "op_inputs": { + "Input": ["input_data"], + "Filter": ["conv2d_weight"] + }, + "op_outputs": { + "Output": ["conv_output_data"] + }, + "op_attrs": dics[0] + }, { + "op_type": "relu", + "op_inputs": { + "X": ["conv_output_data"] + }, + "op_outputs": { + "Out": ["output_data"] + }, + "op_attrs": dics[1] + }] + else: + ops_config = [{ + "op_type": "conv2d", + "op_inputs": { + "Input": ["input_data"], + "Filter": ["conv2d_weight"] + }, + "op_outputs": { + "Output": 
["output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "conv2d_weight": + TensorConfig(data_gen=partial( + generate_weight1, dics)) + }, + inputs={ + "input_data": + TensorConfig(data_gen=partial( + generate_input1, batch, dics)) + }, + outputs=["output_data"]) + + yield program_config def sample_predictor_configs( self, program_config) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): - if len(attrs[0]['paddings']) == 4: + if attrs[0]['groups'] == 1: self.dynamic_shape.min_input_shape = { "input_data": [1, 3, 32, 32], - '': [] + "output_data": [1, 24, 32, 32] } self.dynamic_shape.max_input_shape = { "input_data": [4, 3, 64, 64], - '': [] + "output_data": [4, 24, 64, 64] } self.dynamic_shape.opt_input_shape = { "input_data": [1, 3, 64, 64], - '': [] + "output_data": [1, 24, 64, 64] + } + elif attrs[0]['groups'] == 2: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 6, 32, 32], + "output_data": [1, 24, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 6, 64, 64], + "output_data": [4, 24, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 6, 64, 64], + "output_data": [1, 24, 64, 64] } else: self.dynamic_shape.min_input_shape = { - "input_data": [1, 3, 32, 32] + "input_data": [1, 9, 32, 32], + "output_data": [1, 24, 32, 32] } self.dynamic_shape.max_input_shape = { - "input_data": [4, 3, 64, 64] + "input_data": [4, 9, 64, 64], + "output_data": [4, 24, 64, 64] } self.dynamic_shape.opt_input_shape = { - "input_data": [1, 3, 64, 64] + "input_data": [1, 9, 64, 64], + "output_data": [1, 24, 64, 64] } def clear_dynamic_shape(): @@ -145,11 +167,7 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - # TODO: This is just the example, need to be fixed. - if len(attrs[0]['paddings']) == 4: - return 1, 2 - else: - return 1, 2 + return 1, 2 attrs = [ program_config.ops[i].attrs @@ -169,6 +187,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): attrs, False), (1e-5, 1e-5) # for dynamic_shape + generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num(attrs, @@ -181,29 +200,18 @@ def generate_trt_nodes_num(attrs, dynamic_shape): attrs, True), (1e-5, 1e-5) def add_skip_trt_case(self): - # TODO(wilber): This is just the example to illustrate the skip usage. def teller1(program_config, predictor_config): - if len(program_config.ops[0].attrs['paddings']) == 4: + if program_config.ops[0].attrs[ + 'padding_algorithm'] == "SAME" or program_config.ops[ + 0].attrs['padding_algorithm'] == "VALID": return True return False self.add_skip_case( teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "NOT Implemented: we need to add support in the future ....TODO, just for the example" + "When padding_algorithm is 'SAME' or 'VALID', Trt dose not support. In this case, trt build error is caused by scale op." 
) - def teller2(program_config, predictor_config): - if ( - program_config.ops[0].attrs['dilations'][0] == 1 and - program_config.ops[0].attrs['dilations'][0] == 2 - ) or program_config.ops[0].attrs['padding_algorithm'] != 'EXPLICIT': - return True - return False - - self.add_skip_case(teller2, SkipReasons.TRT_NOT_SUPPORT, - "TODO, just for the example") - pass - def test(self): self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_fusion.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_fusion.py new file mode 100644 index 0000000000000..9fcbda4443de5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_fusion.py @@ -0,0 +1,209 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertConv2dFusionTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + inputs = program_config.inputs + weights = program_config.weights + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[ + 1] * attrs[0]['groups']: + return False + + if attrs[0]['groups'] <= 1: + return False + + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(batch, attrs: List[Dict[str, Any]]): + if attrs[0]['groups'] == 2: + return np.ones([batch, 6, 64, 64]).astype(np.float32) + else: + return np.ones([batch, 9, 64, 64]).astype(np.float32) + + def generate_weight1(attrs: List[Dict[str, Any]]): + return np.random.random([24, 3, 3, 3]).astype(np.float32) + + def generate_weight2(attrs: List[Dict[str, Any]]): + return np.random.random([24, 1, 1]).astype(np.float32) + + for batch in [1, 2, 4]: + for strides in [[1, 1], [2, 2], [1, 2]]: + for paddings in [[0, 3], [1, 2, 3, 4]]: + for groups in [2, 3]: + for padding_algorithm in ['EXPLICIT', 'SAME', 'VALID']: + for dilations in [[1, 1], [2, 2], [1, 2]]: + for data_format in ['NCHW']: + + dics = [{ + "data_fromat": data_format, + "dilations": dilations, + "padding_algorithm": padding_algorithm, + "groups": groups, + "paddings": paddings, + "strides": strides, + "data_format": data_format + }, { + "axis": 1 + }] + + ops_config = [{ + "op_type": "conv2d", + "op_inputs": { + "Input": ["input_data"], + "Filter": ["conv2d_weight"] + }, + "op_outputs": { + "Output": ["conv_output_data"] + }, + "op_attrs": dics[0] + }, { + "op_type": "elementwise_add", + "op_inputs": { + "X": ["conv_output_data"], + "Y": ["elementwise_weight"] + }, + "op_outputs": { + "Out": ["output_data"] + }, + "op_attrs": 
dics[1] + }] + + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "conv2d_weight": + TensorConfig(data_gen=partial( + generate_weight1, dics)), + "elementwise_weight": TensorConfig( + data_gen=partial( + generate_weight2, dics)) + }, + inputs={ + "input_data": + TensorConfig(data_gen=partial( + generate_input1, batch, dics)) + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + if attrs[0]['groups'] == 2: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 6, 32, 32], + "output_data": [1, 24, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 6, 64, 64], + "output_data": [4, 24, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 6, 64, 64], + "output_data": [1, 24, 64, 64] + } + else: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 9, 32, 32], + "output_data": [1, 24, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 9, 64, 64], + "output_data": [4, 24, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 9, 64, 64], + "output_data": [1, 24, 64, 64] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Int8 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), (1e-5, 1e-5) + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Int8 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if program_config.ops[0].attrs[ + 'padding_algorithm'] == "SAME" or program_config.ops[ + 0].attrs['padding_algorithm'] == "VALID": + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "When padding_algorithm is 'SAME' or 'VALID', Trt dose not support. In this case, trt build error is caused by scale op." 
+ ) + + def test(self): + self.add_skip_trt_case() + self.run_test() + + def test_quant(self): + self.add_skip_trt_case() + self.run_test(quant=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py new file mode 100644 index 0000000000000..82dd492b5275f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py @@ -0,0 +1,227 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertConv2dTransposeTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + inputs = program_config.inputs + weights = program_config.weights + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[ + 1] * attrs[0]['groups']: + return False + + if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[0]: + return False + + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(batch, num_channels, attrs: List[Dict[str, Any]]): + return np.ones([batch, num_channels, 64, 64]).astype(np.float32) + + def generate_weight1(num_channels, attrs: List[Dict[str, Any]]): + if attrs[0]['groups'] == 1: + return np.random.random( + [num_channels, num_channels, 3, 3]).astype(np.float32) + else: + return np.random.random( + [num_channels, int(num_channels / 2), 3, 3]).astype( + np.float32) + + for num_channels in [2, 4, 6]: + for batch in [1, 2, 4]: + for strides in [[1, 1], [2, 2], [1, 2]]: + for paddings in [[0, 3], [1, 2, 3, 4]]: + for groups in [2]: + for padding_algorithm in [ + 'EXPLICIT', 'SAME', 'VALID' + ]: + for dilations in [[1, 1], [2, 2], [1, 2]]: + for data_format in ['NCHW']: + + self.num_channels = num_channels + dics = [{ + "data_fromat": data_format, + "dilations": dilations, + "padding_algorithm": + padding_algorithm, + "groups": groups, + "paddings": paddings, + "strides": strides, + "data_format": data_format, + "output_size": [], + "output_padding": [] + }] + + ops_config = [{ + "op_type": "conv2d_transpose", + "op_inputs": { + "Input": ["input_data"], + "Filter": ["conv2d_weight"] + }, + "op_outputs": { + "Output": ["output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config( + ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "conv2d_weight": + TensorConfig(data_gen=partial( + generate_weight1, + num_channels, dics)) + }, + inputs={ + "input_data": + 
TensorConfig(data_gen=partial( + generate_input1, batch, + num_channels, dics)) + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + if self.num_channels == 2: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 2, 32, 32], + "output_data": [1, 24, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 2, 64, 64], + "output_data": [4, 24, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 2, 64, 64], + "output_data": [1, 24, 64, 64] + } + elif self.num_channels == 4: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 4, 32, 32], + "output_data": [1, 24, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 4, 64, 64], + "output_data": [4, 24, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 4, 64, 64], + "output_data": [1, 24, 64, 64] + } + else: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 6, 32, 32], + "output_data": [1, 24, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 6, 64, 64], + "output_data": [4, 24, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 6, 64, 64], + "output_data": [1, 24, 64, 64] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Int8 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), (1e-5, 1e-5) + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Int8 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if program_config.ops[0].attrs[ + 'padding_algorithm'] == "SAME" or program_config.ops[ + 0].attrs['padding_algorithm'] == "VALID": + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "When padding_algorithm is 'SAME' or 'VALID', Trt dose not support. In this case, trt build error is caused by scale op." + ) + + def teller2(program_config, predictor_config): + if program_config.ops[0].attrs['dilations'][ + 0] != 1 or program_config.ops[0].attrs['dilations'][1] != 1: + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "When dilations's element is not equal 1, there are different behaviors between Trt and Paddle." 
+ ) + + def test(self): + self.add_skip_trt_case() + self.run_test() + + def test_quant(self): + self.add_skip_trt_case() + self.run_test(quant=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py new file mode 100644 index 0000000000000..e6b3aa30bf896 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py @@ -0,0 +1,203 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertDepthwiseConv2dTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + inputs = program_config.inputs + weights = program_config.weights + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[ + 1] * attrs[0]['groups']: + return False + + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(batch, attrs: List[Dict[str, Any]]): + if attrs[0]['groups'] == 1: + return np.ones([batch, 1, 64, 64]).astype(np.float32) + elif attrs[0]['groups'] == 2: + return np.ones([batch, 2, 64, 64]).astype(np.float32) + else: + return np.ones([batch, 3, 64, 64]).astype(np.float32) + + def generate_weight1(attrs: List[Dict[str, Any]]): + return np.random.random([24, 1, 3, 3]).astype(np.float32) + + for batch in [1, 2, 4]: + for strides in [[1, 1], [2, 2], [1, 2]]: + for paddings in [[0, 3], [1, 2, 3, 4]]: + for groups in [1, 2, 3]: + for padding_algorithm in ['EXPLICIT', 'SAME', 'VALID']: + for dilations in [[1, 1], [2, 2], [1, 2]]: + for data_format in ['NCHW']: + + dics = [{ + "data_fromat": data_format, + "dilations": dilations, + "padding_algorithm": padding_algorithm, + "groups": groups, + "paddings": paddings, + "strides": strides, + "data_format": data_format + }] + + ops_config = [{ + "op_type": "depthwise_conv2d", + "op_inputs": { + "Input": ["input_data"], + "Filter": ["conv2d_weight"] + }, + "op_outputs": { + "Output": ["output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "conv2d_weight": + TensorConfig(data_gen=partial( + generate_weight1, dics)) + }, + inputs={ + "input_data": + TensorConfig(data_gen=partial( + generate_input1, batch, dics)) + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def 
generate_dynamic_shape(attrs): + if attrs[0]['groups'] == 1: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 1, 32, 32], + "output_data": [1, 24, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 1, 64, 64], + "output_data": [4, 24, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 1, 64, 64], + "output_data": [1, 24, 64, 64] + } + elif attrs[0]['groups'] == 2: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 2, 32, 32], + "output_data": [1, 24, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 2, 64, 64], + "output_data": [4, 24, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 2, 64, 64], + "output_data": [1, 24, 64, 64] + } + else: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 3, 32, 32], + "output_data": [1, 24, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 3, 64, 64], + "output_data": [4, 24, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 3, 64, 64], + "output_data": [1, 24, 64, 64] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Int8 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), (1e-5, 1e-5) + + # for dynamic_shape + + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Int8 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if program_config.ops[0].attrs[ + 'padding_algorithm'] == "SAME" or program_config.ops[ + 0].attrs['padding_algorithm'] == "VALID": + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "When padding_algorithm is 'SAME' or 'VALID', Trt dose not support. In this case, trt build error is caused by scale op." + ) + + def test(self): + self.add_skip_trt_case() + self.run_test() + + def test_quant(self): + self.add_skip_trt_case() + self.run_test(quant=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py new file mode 100644 index 0000000000000..473925c6cdb79 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py @@ -0,0 +1,191 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertDepthwiseConv2dTransposeTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + inputs = program_config.inputs + weights = program_config.weights + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[ + 1] * attrs[0]['groups']: + return False + + if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[1]: + return False + + if inputs['input_data'].shape[1] != attrs[0]['groups']: + return False + + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(batch, attrs: List[Dict[str, Any]]): + return np.ones( + [batch, attrs[0]['groups'], 64, 64]).astype(np.float32) + + def generate_weight1(attrs: List[Dict[str, Any]]): + return np.random.random( + [attrs[0]['groups'], 1, 3, 3]).astype(np.float32) + + for batch in [1, 2, 4]: + for strides in [[1, 1], [2, 2], [1, 2]]: + for paddings in [[0, 3], [1, 2, 3, 4]]: + for groups in [1, 2, 3]: + for padding_algorithm in ['EXPLICIT', 'SAME', 'VALID']: + for dilations in [[1, 1], [2, 2], [1, 2]]: + for data_format in ['NCHW']: + + dics = [{ + "data_fromat": data_format, + "dilations": dilations, + "padding_algorithm": padding_algorithm, + "groups": groups, + "paddings": paddings, + "strides": strides, + "data_format": data_format, + "output_size": [], + "output_padding": [] + }] + + ops_config = [{ + "op_type": "conv2d_transpose", + "op_inputs": { + "Input": ["input_data"], + "Filter": ["conv2d_weight"] + }, + "op_outputs": { + "Output": ["output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "conv2d_weight": + TensorConfig(data_gen=partial( + generate_weight1, dics)) + }, + inputs={ + "input_data": + TensorConfig(data_gen=partial( + generate_input1, batch, dics)) + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "input_data": [1, attrs[0]['groups'], 32, 32], + "output_data": [1, attrs[0]['groups'], 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, attrs[0]['groups'], 64, 64], + "output_data": [4, attrs[0]['groups'], 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, attrs[0]['groups'], 64, 64], + "output_data": [1, attrs[0]['groups'], 64, 64] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + 
self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Int8 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), (1e-5, 1e-5) + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Int8 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if program_config.ops[0].attrs[ + 'padding_algorithm'] == "SAME" or program_config.ops[ + 0].attrs['padding_algorithm'] == "VALID": + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "When padding_algorithm is 'SAME' or 'VALID', Trt dose not support. In this case, trt build error is caused by scale op." + ) + + def teller2(program_config, predictor_config): + if program_config.ops[0].attrs['dilations'][ + 0] != 1 or program_config.ops[0].attrs['dilations'][1] != 1: + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "When dilations's element is not equal 1, there are different behaviors between Trt and Paddle." + ) + + def test(self): + self.add_skip_trt_case() + self.run_test() + + def test_quant(self): + self.add_skip_trt_case() + self.run_test(quant=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py new file mode 100644 index 0000000000000..f25a3b82476dc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py @@ -0,0 +1,260 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
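+
+import unittest  # required by the unittest.main() call at the bottom of this file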
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertEmbEltwiseLayernormTest1(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input(batch, input_size): + return np.random.randint( + 0, 7, size=(batch, input_size, 1)).astype(np.int64) + + def generate_weight1(size11, size2): + return np.random.randn(size11, size2).astype(np.float32) + + def generate_weight2(size12, size2): + return np.random.randn(size12, size2).astype(np.float32) + + def generate_weight3(size13, size2): + return np.random.randn(size13, size2).astype(np.float32) + + def generate_weight4(size2): + return np.random.randn(size2).astype(np.float32) + + for input_size in [16, 128]: + for batch in [1, 2, 4]: + for size1 in [[8, 513, 768], [513, 768, 8], [768, 8, 513]]: + size11 = size1[0] + size12 = size1[1] + size13 = size1[2] + for size2 in [32, 768]: + for norm_axis in [2]: + for epsilon in [0.0001, 0.0005]: + for axis1 in [0, -1]: + for axis2 in [0, -1]: + for type in [ + "lookup_table", + "lookup_table_v2" + ]: + dics = [{ + "is_sparse": False, + "is_distributed": False, + "padding_idx": -1, + "is_test": True + }, { + "is_sparse": False, + "is_distributed": False, + "padding_idx": -1, + }, { + "axis": axis1 + }, { + "axis": axis2 + }, { + "begin_norm_axis": norm_axis, + "epsilon": epsilon + }] + ops_config = [{ + "op_type": type, + "op_inputs": { + "Ids": ["input_data1"], + "W": ["embedding1_weight"] + }, + "op_outputs": { + "Out": + ["embedding1_output"] + }, + "op_attrs": dics[0] + if type == "lookup_table" else + dics[1] + }, { + "op_type": type, + "op_inputs": { + "Ids": ["input_data2"], + "W": ["embedding2_weight"] + }, + "op_outputs": { + "Out": + ["embedding2_output"] + }, + "op_attrs": dics[0] + if type == "lookup_table" else + dics[1] + }, { + "op_type": type, + "op_inputs": { + "Ids": ["input_data3"], + "W": ["embedding3_weight"] + }, + "op_outputs": { + "Out": + ["embedding3_output"] + }, + "op_attrs": dics[0] + if type == "lookup_table" else + dics[1] + }, { + "op_type": "elementwise_add", + "op_inputs": { + "X": ["embedding2_output"], + "Y": ["embedding3_output"] + }, + "op_outputs": { + "Out": [ + "elementwise_add1_output" + ] + }, + "op_attrs": dics[2] + }, { + "op_type": "elementwise_add", + "op_inputs": { + "X": [ + "elementwise_add1_output" + ], + "Y": ["embedding1_output"] + }, + "op_outputs": { + "Out": [ + "elementwise_add2_output" + ] + }, + "op_attrs": dics[3] + }, { + "op_type": "layer_norm", + "op_inputs": { + "X": [ + "elementwise_add2_output" + ], + "Bias": + ["layer_norm_bias"], + "Scale": + ["layer_norm_scale"] + }, + "op_outputs": { + "Y": + ["layer_norm_output1"], + "Mean": + ["layer_norm_output2"], + "Variance": + ["layer_norm_output3"] + }, + "op_attrs": dics[4] + }] + ops = self.generate_op_config( + ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "embedding1_weight": + TensorConfig( + data_gen=partial( + generate_weight1, + size11, size2)), + "embedding2_weight": + TensorConfig( + data_gen=partial( + generate_weight2, + size12, size2)), + "embedding3_weight": + TensorConfig( + data_gen=partial( + generate_weight3, + size13, size2)), + "layer_norm_bias": + TensorConfig( + data_gen=partial( + generate_weight4, + 
size2)), + "layer_norm_scale": + TensorConfig( + data_gen=partial( + generate_weight4, + size2)) + }, + inputs={ + "input_data1": TensorConfig( + data_gen=partial( + generate_input, + batch, input_size)), + "input_data2": TensorConfig( + data_gen=partial( + generate_input, + batch, input_size)), + "input_data3": TensorConfig( + data_gen=partial( + generate_input, + batch, input_size)) + }, + outputs=["layer_norm_output1"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "input_data1": [1, 4, 1], + "input_data2": [1, 4, 1], + "input_data3": [1, 4, 1] + } + self.dynamic_shape.max_input_shape = { + "input_data1": [4, 512, 1], + "input_data2": [4, 512, 1], + "input_data3": [4, 512, 1] + } + self.dynamic_shape.opt_input_shape = { + "input_data1": [2, 128, 1], + "input_data2": [2, 128, 1], + "input_data3": [2, 128, 1] + } + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (0, 5), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (0, 5), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 4), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 4), 1e-5 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py new file mode 100644 index 0000000000000..d803d9e461613 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py @@ -0,0 +1,126 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
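+
+import unittest  # required by the unittest.main() call at the bottom of this file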
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertHardSigmoidTest_dim_2(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input(shape): + return np.random.random(shape).astype(np.float32) + + for batch in [1, 2, 4]: + for shape in [[batch, 64], [batch, 32, 64], [batch, 64, 32, 128]]: + self.input_dim = len(shape) + for slope in [0.1, 0.5]: + for offset in [0.2, 0.7]: + dics = [{"slope": slope, "offset": offset}] + ops_config = [{ + "op_type": "hard_sigmoid", + "op_inputs": { + "X": ["input_data"], + }, + "op_outputs": { + "Out": ["output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input, shape)) + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + if self.input_dim == 2: + self.dynamic_shape.min_input_shape = {"input_data": [1, 8]} + self.dynamic_shape.max_input_shape = {"input_data": [64, 128]} + self.dynamic_shape.opt_input_shape = {"input_data": [2, 16]} + elif self.input_dim == 3: + self.dynamic_shape.min_input_shape = {"input_data": [1, 8, 8]} + self.dynamic_shape.max_input_shape = { + "input_data": [64, 128, 256] + } + self.dynamic_shape.opt_input_shape = {"input_data": [2, 16, 64]} + elif self.input_dim == 4: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 8, 8, 4] + } + self.dynamic_shape.max_input_shape = { + "input_data": [64, 128, 256, 512] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [2, 16, 64, 128] + } + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 2), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 2), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 2), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 2), 1e-5 + + def add_skip_trt_case(self): + def teller(program_config, predictor_config): + if len(self.dynamic_shape. + min_input_shape) == 0 and self.input_dim == 2: + return True + return False + + self.add_skip_case( + teller, SkipReasons.TRT_NOT_SUPPORT, + "Need to repair the case: the output of trt and GPU has diff when inputs' dims is 2 in static shape mode." 
+ ) + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_swish.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_swish.py new file mode 100644 index 0000000000000..283a19ec00574 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_swish.py @@ -0,0 +1,117 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + + +class TrtConvertHardSwishTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + inputs = program_config.inputs + weights = program_config.weights + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + if attrs[0]['threshold'] <= 0 or attrs[0]['scale'] <= 0: + return False + + return True + + def sample_program_configs(self): + def generate_input1(attrs: List[Dict[str, Any]]): + return np.ones([1, 3, 64, 64]).astype(np.float32) + + for threshold in [6.0, 7.0, 100.0, 0.0, -1.0]: + for scale in [5.0, 6.0, 7.0, -1.0, 0.0, 100.0]: + for offset in [3.0, 4.0, 5.0, -1.0, 0.0, 100.0]: + dics = [{ + "threshold": threshold, + "scale": scale, + "offset": offset + }] + + ops_config = [{ + "op_type": "hard_swish", + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": { + "Out": ["hard_swish_output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig(data_gen=partial( + generate_input1, dics)) + }, + outputs=["hard_swish_output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]} + self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]} + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), 
generate_trt_nodes_num( + attrs, False), (1e-5, 1e-5) + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py new file mode 100644 index 0000000000000..e772df522b5c5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py @@ -0,0 +1,436 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertMultiHeadMatmulTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input1(batch, dim1): + return np.random.randn(batch, dim1, 768).astype(np.float32) + + def generate_input2(shape): + return np.random.random(shape).astype(np.float32) + + def generate_weight1(): + return np.random.randn(768, 768).astype(np.float32) + + def generate_weight2(): + return np.random.randn(768).astype(np.float32) + + for batch in [1, 2, 4]: + for reshape_shape in [[0, 0, 12, 64]]: + for dim1 in [128]: + input2_shapes = [[batch, reshape_shape[2], dim1, dim1], + [batch, 1, 1, dim1]] + for input2_shape in input2_shapes: + for axis in [0]: + dics = [{ + "x_num_col_dims": 2, + "y_num_col_dims": 1 + }, { + "axis": 2 + }, { + "shape": reshape_shape + }, { + "axis": [0, 2, 1, 3] + }, { + "x_num_col_dims": 2, + "y_num_col_dims": 1 + }, { + "axis": 2 + }, { + "shape": reshape_shape + }, { + "axis": [0, 2, 1, 3] + }, { + "x_num_col_dims": 2, + "y_num_col_dims": 1 + }, { + "axis": 2 + }, { + "shape": reshape_shape + }, { + "axis": [0, 2, 1, 3] + }, { + "scale": 0.125, + "bias": 0.0, + "bias_after_scale": True + }, { + "alpha": 1.0, + "transpose_X": False, + "transpose_Y": True, + "fused_reshape_X": [], + "fused_reshape_Y": [], + "fused_transpose_X": [], + "fused_transpose_Y": [], + "fused_reshape_Out": [], + "fused_transpose_Out": [] + }, { + "axis": axis + }, { + "axis": -1, + "is_test": True + }, { + "seed": 0, + "dropout_prob": 0.10000000149011612, + "dropout_implementation": "upscale_in_train", + "fix_seed": False, + "is_test": True + }, { + "alpha": 1.0, + "transpose_X": False, + "transpose_Y": False, + "fused_reshape_X": [], + "fused_reshape_Y": [], + 
"fused_transpose_X": [], + "fused_transpose_Y": [], + "fused_reshape_Out": [], + "fused_transpose_Out": [] + }, { + "axis": [0, 2, 1, 3] + }, { + "shape": [0, 0, 768] + }, { + "x_num_col_dims": 2, + "y_num_col_dims": 1 + }] + + ops_config = [ + { + "op_type": "mul", + "op_inputs": { + "X": ["input_data1"], + "Y": ["mul1_weight"] + }, + "op_outputs": { + "Out": ["mul1_output"] + }, + "op_attrs": dics[0] + }, + { + "op_type": "elementwise_add", + "op_inputs": { + "X": ["mul1_output"], + "Y": ["elementwise_add1_weight"] + }, + "op_outputs": { + "Out": ["elementwise_add1_output"] + }, + "op_attrs": dics[1] + }, + { + "op_type": "reshape2", + "op_inputs": { + "X": ["elementwise_add1_output"], + }, + "op_outputs": { + "Out": ["reshape21_output"], + "XShape": ["reshape21_output_xshape"] + }, + "op_attrs": dics[2] + }, + { + "op_type": "transpose2", + "op_inputs": { + "X": ["reshape21_output"] + }, + "op_outputs": { + "Out": ["transpose21_output"], + "XShape": + ["transpose21_output_xshape"] + }, + "op_attrs": dics[3] + }, + { + "op_type": "mul", + "op_inputs": { + "X": ["input_data1"], + "Y": ["mul2_weight"] + }, + "op_outputs": { + "Out": ["mul2_output"] + }, + "op_attrs": dics[4] + }, + { + "op_type": "elementwise_add", + "op_inputs": { + "X": ["mul2_output"], + "Y": ["elementwise_add2_weight"] + }, + "op_outputs": { + "Out": ["elementwise_add2_output"] + }, + "op_attrs": dics[5] + }, + { + "op_type": "reshape2", + "op_inputs": { + "X": ["elementwise_add2_output"] + }, + "op_outputs": { + "Out": ["reshape22_output"], + "XShape": ["reshape22_output_xshape"] + }, + "op_attrs": dics[6] + }, + { + "op_type": "transpose2", + "op_inputs": { + "X": ["reshape22_output"] + }, + "op_outputs": { + "Out": ["transpose22_output"], + "XShape": + ["transpose22_output_xshape"] + }, + "op_attrs": dics[7] + }, + { + "op_type": "mul", + "op_inputs": { + "X": ["input_data1"], + "Y": ["mul3_weight"] + }, + "op_outputs": { + "Out": ["mul3_output"] + }, + "op_attrs": dics[8] + }, + { + "op_type": "elementwise_add", + "op_inputs": { + "X": ["mul3_output"], + "Y": ["elementwise_add3_weight"] + }, + "op_outputs": { + "Out": ["elementwise_add3_output"] + }, + "op_attrs": dics[9] + }, + { + "op_type": "reshape2", + "op_inputs": { + "X": ["elementwise_add3_output"] + }, + "op_outputs": { + "Out": ["reshape23_output"], + "XShape": ["reshape23_output_xshape"] + }, + "op_attrs": dics[10] + }, + { + "op_type": "transpose2", + "op_inputs": { + "X": ["reshape23_output"] + }, + "op_outputs": { + "Out": ["transpose23_output"], + "XShape": + ["transpose23_output_xshape"] + }, + "op_attrs": dics[11] + }, + { + "op_type": "scale", + "op_inputs": { + "X": ["transpose23_output"], + }, + "op_outputs": { + "Out": ["scale_output"] + }, + "op_attrs": dics[12] + }, + { + "op_type": "matmul", + "op_inputs": { + "X": ["scale_output"], + "Y": ["transpose22_output"], + }, + "op_outputs": { + "Out": ["matmul1_output"] + }, + "op_attrs": dics[13] + }, + { + "op_type": "elementwise_add", + "op_inputs": { + "X": ["matmul1_output"], + "Y": ["input_data2"] + }, + "op_outputs": { + "Out": ["elementwise_add4_output"] + }, + "op_attrs": dics[14] + }, + { + "op_type": "softmax", + "op_inputs": { + "X": ["elementwise_add4_output"] + }, + "op_outputs": { + "Out": ["softmax_output"] + }, + "op_attrs": dics[15] + }, + { + "op_type": "dropout", + "op_inputs": { + "X": ["softmax_output"], + }, + "op_outputs": { + "Out": ["dropout3_output"] + }, + "op_attrs": dics[16] + }, + { + "op_type": "matmul", + "op_inputs": { + "X": ["dropout3_output"], + "Y": 
["transpose21_output"], + }, + "op_outputs": { + "Out": ["matmul2_output"] + }, + "op_attrs": dics[17] + }, + { + "op_type": "transpose2", + "op_inputs": { + "X": ["matmul2_output"] + }, + "op_outputs": { + "Out": ["transpose24_output"], + "XShape": + ["transpose24_output_xshape"] + }, + "op_attrs": dics[18] + }, + { + "op_type": "reshape2", + "op_inputs": { + "X": ["transpose24_output"] + }, + "op_outputs": { + "Out": ["reshape24_output"], + "XShape": ["reshape24_output_xshape"] + }, + "op_attrs": dics[19] + }, + # In order to fuse ops with + # multihead_matmul_fuse_pass_v2, the last op + # must be mul. + { + "op_type": "mul", + "op_inputs": { + "X": ["reshape24_output"], + "Y": ["mul4_weight"] + }, + "op_outputs": { + "Out": ["mul4_output"] + }, + "op_attrs": dics[20] + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "mul1_weight": TensorConfig( + data_gen=partial(generate_weight1)), + "mul2_weight": TensorConfig( + data_gen=partial(generate_weight1)), + "mul3_weight": TensorConfig( + data_gen=partial(generate_weight1)), + "mul4_weight": TensorConfig( + data_gen=partial(generate_weight1)), + "elementwise_add1_weight": TensorConfig( + data_gen=partial(generate_weight2)), + "elementwise_add2_weight": TensorConfig( + data_gen=partial(generate_weight2)), + "elementwise_add3_weight": TensorConfig( + data_gen=partial(generate_weight2)), + }, + inputs={ + "input_data1": TensorConfig( + data_gen=partial(generate_input1, batch, + dim1)), + "input_data2": TensorConfig( + data_gen=partial(generate_input2, + input2_shape)), + }, + outputs=["mul4_output"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + # The last dim of input1 and input2 should be static. 
+ self.dynamic_shape.min_input_shape = { + "input_data1": [1, 8, 768], + "input_data2": [1, 1, 1, 128], + "reshape24_output": [1, 128, 768] + } + self.dynamic_shape.max_input_shape = { + "input_data1": [16, 512, 768], + "input_data2": [16, 256, 512, 128], + "reshape24_output": [1, 128, 768] + } + self.dynamic_shape.opt_input_shape = { + "input_data1": [8, 128, 768], + "input_data2": [8, 32, 64, 128], + "reshape24_output": [1, 128, 768] + } + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 4), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 4), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 3), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 3), 1e-5 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py new file mode 100644 index 0000000000000..3e923b1bd89d6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py @@ -0,0 +1,148 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
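The dynamic_shape min/opt/max dictionaries that these auto-scan tests (hard_sigmoid and hard_swish earlier, multihead_matmul above, pool2d below) fill in are what the harness hands to TensorRT when it builds the predictor. As a rough standalone sketch, not part of this patch and with a placeholder model path and illustrative option values, the same dynamic-shape ranges used by the multihead_matmul test could be set directly on a paddle.inference config:

    import paddle.inference as paddle_infer

    # Placeholder model path (hypothetical); any saved inference model whose
    # inputs are named "input_data1"/"input_data2" would do.
    config = paddle_infer.Config("./multihead_model/model.pdmodel",
                                 "./multihead_model/model.pdiparams")
    config.enable_use_gpu(100, 0)
    config.enable_tensorrt_engine(
        workspace_size=1 << 30,
        max_batch_size=4,
        min_subgraph_size=0,
        precision_mode=paddle_infer.PrecisionType.Float32,
        use_static=False,
        use_calib_mode=False)
    # The last dim stays the same across min/opt/max, mirroring the
    # "should be static" comment in the test above.
    config.set_trt_dynamic_shape_info(
        {"input_data1": [1, 8, 768], "input_data2": [1, 1, 1, 128]},
        {"input_data1": [16, 512, 768], "input_data2": [16, 256, 512, 128]},
        {"input_data1": [8, 128, 768], "input_data2": [8, 32, 64, 128]})
    predictor = paddle_infer.create_predictor(config)
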
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertPool2dTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(attrs: List[Dict[str, Any]]): + return np.ones([1, 3, 64, 64]).astype(np.float32) + + def generate_weight1(attrs: List[Dict[str, Any]]): + return np.random.random([24, 3, 3, 3]).astype(np.float32) + + for strides in [[1, 1], [2, 2], [1, 2]]: + for paddings in [[0, 2], [0, 3], [1, 2, 3, 4]]: + for pooling_type in ['max', 'avg']: + for padding_algotithm in ['EXPLICIT', 'SAME', 'VAILD']: + for ksize in [[2, 3], [3, 3]]: + for data_format in ['NCHW']: + for global_pooling in [True, False]: + for exclusive in [True, False]: + for adaptive in [True, False]: + for ceil_mode in [True, False]: + self.paddings = paddings + + dics = [{ + "pooling_type": + pooling_type, + "ksize": ksize, + "data_fromat": data_format, + "padding_algorithm": + padding_algotithm, + "paddings": paddings, + "strides": strides, + "data_format": data_format, + "global_pooling": + global_pooling, + "exclusive": exclusive, + "adaptive": adaptive, + "ceil_mode": ceil_mode + }] + + ops_config = [{ + "op_type": "pool2d", + "op_inputs": { + "X": ["input_data"], + }, + "op_outputs": { + "Out": ["output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config( + ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": + TensorConfig( + data_gen=partial( + generate_input1, + dics)) + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]} + self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]} + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if self.paddings == [0, 3] or attrs[0][ + 'global_pooling'] == True or attrs[0]['ceil_mode'] == True: + return 0, 3 + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if len(program_config.ops[0].attrs['paddings']) == 4: + return True + return False + + 
self.add_skip_case(teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "4-dims paddings are not support for trt now.") + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_dynamic_shape.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_dynamic_shape.py index fd69a8bf6c37f..a7ae6a635ecdf 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_dynamic_shape.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_dynamic_shape.py @@ -66,15 +66,18 @@ def test_check_output(self): self.check_output_with_option(use_gpu) -class TRTDynamicShapeOutOfBound2Test(TRTDynamicShapeTest): - def set_feeds(self): - return {"data": np.random.random([2, 3, 16, 16]).astype("float32"), } - - def test_check_output(self): - if core.is_compiled_with_cuda(): - use_gpu = True - with self.assertRaises(Exception): - self.check_output_with_option(use_gpu) +# (wanghaipeng03) temporarily disable this test, in some cases, this test code +# doesn't raise exception, TRT just gives the right result +# class TRTDynamicShapeOutOfBound2Test(TRTDynamicShapeTest): +# def set_feeds(self): +# return {"data": np.random.random([2, 3, 16, 16]).astype("float32"), } +# +# def test_check_output(self): +# if core.is_compiled_with_cuda(): +# use_gpu = True +# with self.assertRaises(Exception): +# self.check_output_with_option(use_gpu) +# class TRTDynamicShapeOutOfBound3Test(TRTDynamicShapeTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py index 6957a4ceb26de..3ac185fbb04ac 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py @@ -25,6 +25,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core import paddle.inference as paddle_infer +import shutil from paddle import compat as cpt from typing import Optional, List, Callable, Dict, Any, Set @@ -68,18 +69,21 @@ def __init__(self, methodName='runTest'): max_batch_size=4, min_subgraph_size=0, precision=paddle_infer.PrecisionType.Float32, - use_static=False, + use_static=True, use_calib_mode=False) self.dynamic_shape = self.DynamicShapeParam({}, {}, {}, False) self.num_percent_cases = float( os.getenv( 'TEST_NUM_PERCENT_CASES', default='1.0')) + abs_dir = os.path.abspath(os.path.dirname(__file__)) + cache_dir = str(self.__module__) + '_trt_cache_dir' + self.trt_cache_dir = os.path.join(abs_dir, cache_dir) def create_inference_config(self, use_trt=True) -> paddle_infer.Config: config = paddle_infer.Config() - config.disable_glog_info() + # config.disable_glog_info() config.enable_use_gpu(100, 0) - config.set_optim_cache_dir('trt_convert_cache_dir') + config.set_optim_cache_dir(self.trt_cache_dir) if use_trt: config.switch_ir_debug() config.enable_tensorrt_engine( @@ -218,6 +222,9 @@ def run_test(self, quant=False): for pred_config, nodes_num, threshold in self.sample_predictor_configs( prog_config): + if os.path.exists(self.trt_cache_dir): + shutil.rmtree(self.trt_cache_dir) + if isinstance(threshold, float): atol = threshold rtol = 1e-8 @@ -261,9 +268,9 @@ def run_test(self, quant=False): if not skip_flag: self.assert_op_size(nodes_num[0], nodes_num[1]) # deserialize test - #if nodes_num[0] > 0: - # self.run_test_config(model, params, prog_config, - # pred_config_deserialize, 
feed_data) + if nodes_num[0] > 0: + self.run_test_config(model, params, prog_config, + pred_config_deserialize, feed_data) except Exception as e: self.fail_log( str(prog_config) + ' vs ' + self.inference_config_str( diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py new file mode 100644 index 0000000000000..62c8c9571b793 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py @@ -0,0 +1,236 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np +from paddle import enable_static +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16 +from paddle.fluid.framework import _current_expected_place +import paddle.fluid.core as core + + +@OpTestTool.skip_if(not (isinstance(_current_expected_place(), core.CPUPlace)), + "GPU is not supported") +class TestMKLDNNElementwiseSubOp(OpTest): + def setUp(self): + self.op_type = "elementwise_sub" + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': self.out} + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + def test_check_grad_ignore_x(self): + self.check_grad(['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ignore_y(self): + self.check_grad(['X'], 'Out', no_grad_set=set('Y')) + + def init_axis(self): + self.axis = -1 + + def init_kernel_type(self): + self.use_mkldnn = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output() + + +class TestMKLDNNElementwiseSubOp2(TestMKLDNNElementwiseSubOp): + def init_input_output(self): + self.x = np.random.random((100, )).astype(self.dtype) + self.y = np.random.random((100, )).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + +class TestMKLDNNElementwiseSubOp3(TestMKLDNNElementwiseSubOp): + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + +class TestMKLDNNElementwiseSubOp4(TestMKLDNNElementwiseSubOp): + def init_input_output(self): + self.x = np.random.uniform(1, 2, [2, 3, 4, 32]).astype(self.dtype) + self.y = np.random.uniform(1, 2, [4, 32]).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + +class TestMKLDNNElementwiseSubOp5(TestMKLDNNElementwiseSubOp): + def 
init_input_output(self): + self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype) + self.y = np.random.uniform(1, 2, [100]).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + +class TestMKLDNNElementwiseSubOp_broadcast(TestMKLDNNElementwiseSubOp): + def init_input_output(self): + self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype) + self.y = np.random.rand(10, 12).astype(self.dtype) + self.out = self.x - self.y.reshape(1, 10, 12, 1) + + def init_axis(self): + self.axis = 1 + + +class TestElementwiseSubOp_xsize_lessthan_ysize_sub(TestMKLDNNElementwiseSubOp): + def init_input_output(self): + self.x = np.random.rand(10, 12).astype(self.dtype) + self.y = np.random.rand(2, 2, 10, 12).astype(self.dtype) + self.out = self.x - self.y + + def init_axis(self): + self.axis = 2 + + def test_check_grad_normal(self): + pass + + def test_check_grad_ignore_y(self): + pass + + def test_check_grad_ignore_x(self): + pass + + +@OpTestTool.skip_if_not_cpu_bf16() +class TestBf16(TestMKLDNNElementwiseSubOp): + def setUp(self): + self.op_type = "elementwise_sub" + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + + self.x_bf16 = convert_float_to_uint16(self.x) + self.y_bf16 = convert_float_to_uint16(self.y) + self.inputs = {'X': self.x_bf16, 'Y': self.y_bf16} + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': convert_float_to_uint16(self.out)} + + def init_dtype(self): + self.dtype = np.float32 + self.mkldnn_data_type = "bfloat16" + + def init_input_output(self): + self.x = np.random.random(100, ).astype(self.dtype) + self.y = np.random.random(100, ).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + def test_check_grad_normal(self): + self.check_grad_with_place( + core.CPUPlace(), ["X", "Y"], + "Out", + user_defined_grads=[self.x, -self.x], + user_defined_grad_outputs=[self.x_bf16]) + + def test_check_grad_ignore_x(self): + self.check_grad_with_place( + core.CPUPlace(), ["Y"], + "Out", + user_defined_grads=[-self.y], + user_defined_grad_outputs=[self.y_bf16]) + + def test_check_grad_ignore_y(self): + self.check_grad_with_place( + core.CPUPlace(), ["X"], + "Out", + user_defined_grads=[self.x], + user_defined_grad_outputs=[self.x_bf16]) + + +class TestBf16Broadcasting(TestBf16): + def init_input_output(self): + self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype) + self.y = np.random.uniform(1, 2, [100]).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + def compute_reduced_gradients(self, out_grads): + part_sum = np.add.reduceat(out_grads, [0], axis=0) + part_sum = np.add.reduceat(part_sum, [0], axis=1) + part_sum = np.add.reduceat(part_sum, [0], axis=2) + return -part_sum.flatten() + + def test_check_grad_normal(self): + self.check_grad_with_place( + core.CPUPlace(), ["X", "Y"], + "Out", + user_defined_grads=[ + self.x, self.compute_reduced_gradients(self.x) + ], + user_defined_grad_outputs=[self.x_bf16]) + + def test_check_grad_ignore_x(self): + self.check_grad_with_place( + core.CPUPlace(), ["Y"], + "Out", + user_defined_grads=[self.compute_reduced_gradients(self.x)], + user_defined_grad_outputs=[self.x_bf16]) + + +class TestInt8(TestMKLDNNElementwiseSubOp): + def init_kernel_type(self): + self.use_mkldnn = True + self._cpu_only = True + + def init_dtype(self): + self.dtype = np.int8 + + def init_input_output(self): + self.x = np.random.randint(0, 3, (12, 9)).astype("int8") + 
self.y = np.random.randint(0, 3, (12, 9)).astype("int8") + self.out = np.subtract(self.x, self.y) + + def init_scales(self): + self.attrs['Scale_x'] = 1.0 + self.attrs['Scale_y'] = 1.0 + self.attrs['Scale_out'] = 1.0 + + def test_check_output(self): + self.init_scales() + self.check_output() + + def test_check_grad_normal(self): + pass + + def test_check_grad_ignore_x(self): + pass + + def test_check_grad_ignore_y(self): + pass + + +if __name__ == '__main__': + enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_randperm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_randperm_op_npu.py new file mode 100644 index 0000000000000..4ec353c55deb1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_randperm_op_npu.py @@ -0,0 +1,122 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid.core as core +from paddle.static import program_guard, Program +from test_randperm_op import check_randperm_out, error_msg, convert_dtype + +paddle.enable_static() + + +class TestRandpermOp(OpTest): + """ Test randperm op.""" + + def setUp(self): + self.set_npu() + self.op_type = "randperm" + self.n = 200 + self.dtype = "int64" + + self.inputs = {} + self.outputs = {"Out": np.zeros((self.n)).astype(self.dtype)} + self.init_attrs() + self.attrs = { + "n": self.n, + "dtype": convert_dtype(self.dtype), + } + + def set_npu(self): + self.__class__.use_npu = True + + def _get_places(self): + return [paddle.NPUPlace(0)] + + def init_attrs(self): + pass + + def test_check_output(self): + self.check_output_customized(self.verify_output) + + def verify_output(self, outs): + out_np = np.array(outs[0]) + self.assertTrue( + check_randperm_out(self.n, out_np), msg=error_msg(out_np)) + + +class TestRandpermOpN(TestRandpermOp): + def init_attrs(self): + self.n = 10000 + + +class TestRandpermOpInt32(TestRandpermOp): + def init_attrs(self): + self.dtype = "int32" + + +class TestRandpermOpFloat32(TestRandpermOp): + def init_attrs(self): + self.dtype = "float32" + + +class TestRandpermOpFloat64(TestRandpermOp): + def init_attrs(self): + self.dtype = "float64" + + +class TestRandpermOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + self.assertRaises(ValueError, paddle.randperm, -3) + self.assertRaises(TypeError, paddle.randperm, 10, 'int8') + + +class TestRandpermAPI(unittest.TestCase): + def test_out(self): + n = 10 + place = paddle.NPUPlace(0) + with program_guard(Program(), Program()): + x1 = paddle.randperm(n) + x2 = paddle.randperm(n, 'float32') + + exe = paddle.static.Executor(place) + res = exe.run(fetch_list=[x1, x2]) + + self.assertEqual(res[0].dtype, np.int64) + self.assertEqual(res[1].dtype, np.float32) + self.assertTrue(check_randperm_out(n, res[0])) + 
self.assertTrue(check_randperm_out(n, res[1])) + + +class TestRandpermImperative(unittest.TestCase): + def test_out(self): + paddle.disable_static(paddle.NPUPlace(0)) + n = 10 + for dtype in ['int32', np.int64, 'float32', 'float64']: + data_p = paddle.randperm(n, dtype) + data_np = data_p.numpy() + self.assertTrue( + check_randperm_out(n, data_np), msg=error_msg(data_np)) + paddle.enable_static() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py index e02a9dc446a54..e819f422f2b44 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py @@ -68,7 +68,7 @@ def _run_static(self): return out def _run_dynamic(self): - paddle.disable_static() + paddle.disable_static(paddle.NPUPlace(0)) x = paddle.ones(shape=self.shape, dtype=self.dtype) self._call_setitem(x) out = x.numpy() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 2f9c05302276a..a50a667f663ee 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -1368,6 +1368,12 @@ def check_output_customized(self, checker, custom_place=None): outs.sort(key=len) checker(outs) + def check_output_with_place_customized(self, checker, place): + outs = self.calc_output(place) + outs = [np.array(out) for out in outs] + outs.sort(key=len) + checker(outs) + def _assert_is_close(self, numeric_grads, analytic_grads, names, max_relative_error, msg_prefix): for a, b, name in six.moves.zip(numeric_grads, analytic_grads, names): diff --git a/python/paddle/fluid/tests/unittests/test_batch_sampler.py b/python/paddle/fluid/tests/unittests/test_batch_sampler.py index 4c323a2511f5b..4c5338314afb1 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_sampler.py +++ b/python/paddle/fluid/tests/unittests/test_batch_sampler.py @@ -22,6 +22,8 @@ RandomSampler, WeightedRandomSampler from paddle.io import DistributedBatchSampler +IMAGE_SIZE = 32 + class RandomDataset(Dataset): def __init__(self, sample_num, class_num): @@ -31,7 +33,7 @@ def __init__(self, sample_num, class_num): def __getitem__(self, idx): np.random.seed(idx) image = np.random.random([IMAGE_SIZE]).astype('float32') - label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + label = np.random.randint(0, self.class_num - 1, (1, )).astype('int64') return image, label def __len__(self): diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py index fc0a64b18a7af..715579c332dfa 100644 --- a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py @@ -204,6 +204,7 @@ def add_error_cases(suite): suite.addTest( Conv1DErrorTestCase( methodName='runTest', padding=[1, 2, 3, 4, 5])) + suite.addTest(Conv1DErrorTestCase(methodName='runTest', dilation=-10)) def load_tests(loader, standard_tests, pattern): diff --git a/python/paddle/fluid/tests/unittests/test_cost_model.py b/python/paddle/fluid/tests/unittests/test_cost_model.py new file mode 100644 index 0000000000000..483f665fde7e8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cost_model.py @@ -0,0 +1,56 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +import paddle +import paddle.fluid.core as core + +paddle.enable_static() + +device = "gpu" if core.is_compiled_with_cuda() else "cpu" + + +class TestCostModel(unittest.TestCase): + def test_profiler_measure_empty_program(self): + cost_model = core.CostModel() + empty_program = paddle.static.Program() + startup_program = paddle.static.Program() + cost_data = cost_model.profile_measure(empty_program, startup_program, + device, ["time"]) + self.assertEqual(cost_data.get_whole_time_ms(), 0) + + def test_profiler_measure_program(self): + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + # TODO(zhhsplendid): support paddle.static.data, which is uninitialized data + data = paddle.ones(name='X', shape=[16, 100], dtype='float32') + hidden = paddle.static.nn.fc(data, 10) + loss = paddle.mean(hidden) + cost_model = core.CostModel() + cost_data = cost_model.profile_measure(main_program, startup_program, + device, ["time"]) + fc_op_time = cost_data.get_op_time_ms(0) + mean_op_time = cost_data.get_op_time_ms(1) + self.assertGreater(fc_op_time, 0) + self.assertGreater(mean_op_time, 0) + self.assertGreaterEqual(cost_data.get_whole_time_ms(), + fc_op_time + mean_op_time) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_determinant_op.py b/python/paddle/fluid/tests/unittests/test_determinant_op.py new file mode 100644 index 0000000000000..f8110bffa2f71 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_determinant_op.py @@ -0,0 +1,149 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
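The determinant tests that follow build their references with np.linalg.det and np.linalg.slogdet, which treat the last two dimensions as the matrix and keep every leading dimension as a batch dimension. A small numpy-only sketch of the shape contract the tests rely on (illustrative, not part of the patch):

    import numpy as np

    np.random.seed(0)
    x = np.random.rand(3, 3, 3, 5, 5)       # a 3x3x3 batch of 5x5 matrices
    det = np.linalg.det(x)
    assert det.shape == (3, 3, 3)            # one determinant per matrix

    sign, logabsdet = np.linalg.slogdet(x)   # slogdet returns (sign, log|det|)
    assert sign.shape == logabsdet.shape == (3, 3, 3)
    np.testing.assert_allclose(sign * np.exp(logabsdet), det, rtol=1e-6)
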
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.nn.functional as F +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.tensor as tensor + +paddle.enable_static() + + +class TestDeterminantOp(OpTest): + def setUp(self): + self.init_data() + self.op_type = "determinant" + self.outputs = {'Out': self.target} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['Input'], ['Out']) + + def init_data(self): + np.random.seed(0) + self.case = np.random.rand(3, 3, 3, 5, 5).astype('float64') + self.inputs = {'Input': self.case} + self.target = np.linalg.det(self.case) + + +class TestDeterminantOpCase1(TestDeterminantOp): + def init_data(self): + np.random.seed(0) + self.case = np.random.rand(10, 10).astype('float32') + self.inputs = {'Input': self.case} + self.target = np.linalg.det(self.case) + + +class TestDeterminantOpCase2(TestDeterminantOp): + def init_data(self): + np.random.seed(0) + # not invertible matrix + self.case = np.ones([4, 2, 4, 4]).astype('float64') + self.inputs = {'Input': self.case} + self.target = np.linalg.det(self.case) + + +class TestDeterminantAPI(unittest.TestCase): + def setUp(self): + np.random.seed(0) + self.shape = [3, 3, 5, 5] + self.x = np.random.random(self.shape).astype(np.float32) + self.place = paddle.CPUPlace() + + def test_api_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('X', self.shape) + out = paddle.linalg.det(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x}, fetch_list=[out]) + out_ref = np.linalg.det(self.x) + + for out in res: + self.assertEqual(np.allclose(out, out_ref, rtol=1e-03), True) + + def test_api_dygraph(self): + paddle.disable_static(self.place) + x_tensor = paddle.to_tensor(self.x) + out = paddle.linalg.det(x_tensor) + out_ref = np.linalg.det(self.x) + self.assertEqual(np.allclose(out.numpy(), out_ref, rtol=1e-03), True) + paddle.enable_static() + + +class TestSlogDeterminantOp(OpTest): + def setUp(self): + self.op_type = "slogdeterminant" + self.init_data() + self.outputs = {'Out': self.target} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + # the slog det's grad value is always huge + self.check_grad(['Input'], ['Out'], max_relative_error=0.1) + + def init_data(self): + np.random.seed(0) + self.case = np.random.rand(4, 5, 5).astype('float64') + self.inputs = {'Input': self.case} + self.target = np.array(np.linalg.slogdet(self.case)) + + +class TestSlogDeterminantOpCase1(TestSlogDeterminantOp): + def init_data(self): + np.random.seed(0) + self.case = np.random.rand(2, 2, 5, 5).astype(np.float32) + self.inputs = {'Input': self.case} + self.target = np.array(np.linalg.slogdet(self.case)) + + +class TestSlogDeterminantAPI(unittest.TestCase): + def setUp(self): + np.random.seed(0) + self.shape = [3, 3, 5, 5] + self.x = np.random.random(self.shape).astype(np.float32) + self.place = paddle.CPUPlace() + + def test_api_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('X', self.shape) + out = paddle.linalg.slogdet(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x}, fetch_list=[out]) + out_ref = np.array(np.linalg.slogdet(self.x)) + for out in res: + self.assertEqual(np.allclose(out, out_ref, rtol=1e-03), True) + + def 
test_api_dygraph(self): + paddle.disable_static(self.place) + x_tensor = paddle.to_tensor(self.x) + out = paddle.linalg.slogdet(x_tensor) + out_ref = np.array(np.linalg.slogdet(self.x)) + self.assertEqual(np.allclose(out.numpy(), out_ref, rtol=1e-03), True) + paddle.enable_static() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eigh_op.py b/python/paddle/fluid/tests/unittests/test_eigh_op.py index e434364702525..8e8c9df199f14 100644 --- a/python/paddle/fluid/tests/unittests/test_eigh_op.py +++ b/python/paddle/fluid/tests/unittests/test_eigh_op.py @@ -140,7 +140,7 @@ def test_in_static_mode(self): self.check_static_complex_result() def test_in_dynamic_mode(self): - paddle.disable_static(self.place) + paddle.disable_static() input_real_data = paddle.to_tensor(self.real_data) expected_w, expected_v = np.linalg.eigh(self.real_data) actual_w, actual_v = paddle.linalg.eigh(input_real_data) @@ -152,7 +152,7 @@ def test_in_dynamic_mode(self): self.compare_result(actual_w, actual_v.numpy(), expected_w, expected_v) def test_eigh_grad(self): - paddle.disable_static(self.place) + paddle.disable_static() x = paddle.to_tensor(self.complex_data, stop_gradient=False) w, v = paddle.linalg.eigh(x) (w.sum() + paddle.abs(v).sum()).backward() diff --git a/python/paddle/fluid/tests/unittests/test_eigvals_op.py b/python/paddle/fluid/tests/unittests/test_eigvals_op.py new file mode 100644 index 0000000000000..eff9d4ea6e801 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eigvals_op.py @@ -0,0 +1,307 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
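Eigenvalues come back in no guaranteed order, which is why the verify_output helpers in the test below sort both sides and then match entries with np.isclose instead of comparing elementwise. The underlying idea, reduced to a numpy-only sketch (illustrative, not part of the patch):

    import numpy as np

    np.random.seed(0)
    a = np.random.rand(4, 4)
    ev_ref = np.linalg.eigvals(a)

    def sort_complex(v):
        # np.lexsort applies the last key first: order by real part, then imag part.
        return v[np.lexsort((v.imag, v.real))]

    # Stand-in for another solver's output: same eigenvalues, different order.
    ev_other = np.random.permutation(ev_ref)
    np.testing.assert_allclose(
        sort_complex(ev_other), sort_complex(ev_ref), atol=1e-6)
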
+ +import paddle +import unittest +import paddle.fluid as fluid +import paddle.fluid.core as core +import numpy as np +from op_test import OpTest + +np.set_printoptions(threshold=np.inf) + + +def np_eigvals(a): + res = np.linalg.eigvals(a) + if (a.dtype == np.float32 or a.dtype == np.complex64): + res = res.astype(np.complex64) + else: + res = res.astype(np.complex128) + + return res + + +class TestEigvalsOp(OpTest): + def setUp(self): + np.random.seed(0) + paddle.enable_static() + self.op_type = "eigvals" + self.set_dtype() + self.set_input_dims() + self.set_input_data() + + np_output = np_eigvals(self.input_data) + + self.inputs = {'X': self.input_data} + self.outputs = {'Out': np_output} + + def set_dtype(self): + self.dtype = np.float32 + + def set_input_dims(self): + self.input_dims = (5, 5) + + def set_input_data(self): + if (self.dtype == np.float32 or self.dtype == np.float64): + self.input_data = np.random.random(self.input_dims).astype( + self.dtype) + else: + self.input_data = ( + np.random.random(self.input_dims) + + np.random.random(self.input_dims) * 1j).astype(self.dtype) + + def test_check_output(self): + self.__class__.no_need_check_grad = True + self.check_output_with_place_customized( + checker=self.verify_output, place=core.CPUPlace()) + + def verify_output(self, outs): + actual_outs = np.sort(np.array(outs[0])) + expect_outs = np.sort(np.array(self.outputs['Out'])) + self.assertTrue( + actual_outs.shape == expect_outs.shape, "Output shape has diff.\n" + "Expect shape " + str(expect_outs.shape) + "\n" + "But Got" + + str(actual_outs.shape) + " in class " + self.__class__.__name__) + + n_dim = actual_outs.shape[-1] + for actual_row, expect_row in zip( + actual_outs.reshape((-1, n_dim)), + expect_outs.reshape((-1, n_dim))): + is_mapped_index = np.zeros((n_dim, )) + for i in range(n_dim): + is_mapped = False + for j in range(n_dim): + if is_mapped_index[j] == 0 and np.isclose( + np.array(actual_row[i]), + np.array(expect_row[j]), + atol=1e-5): + is_mapped_index[j] = True + is_mapped = True + break + self.assertTrue( + is_mapped, + "Output has diff in class " + self.__class__.__name__ + + "\nExpect " + str(expect_outs) + "\n" + "But Got" + + str(actual_outs) + "\nThe data " + str(actual_row[i]) + + " in " + str(actual_row) + " mismatch.") + + +class TestEigvalsOpFloat64(TestEigvalsOp): + def set_dtype(self): + self.dtype = np.float64 + + +class TestEigvalsOpComplex64(TestEigvalsOp): + def set_dtype(self): + self.dtype = np.complex64 + + +class TestEigvalsOpComplex128(TestEigvalsOp): + def set_dtype(self): + self.dtype = np.complex128 + + +class TestEigvalsOpLargeScare(TestEigvalsOp): + def set_input_dims(self): + self.input_dims = (128, 128) + + +class TestEigvalsOpLargeScareFloat64(TestEigvalsOpLargeScare): + def set_dtype(self): + self.dtype = np.float64 + + +class TestEigvalsOpLargeScareComplex64(TestEigvalsOpLargeScare): + def set_dtype(self): + self.dtype = np.complex64 + + +class TestEigvalsOpLargeScareComplex128(TestEigvalsOpLargeScare): + def set_dtype(self): + self.dtype = np.complex128 + + +class TestEigvalsOpBatch1(TestEigvalsOp): + def set_input_dims(self): + self.input_dims = (1, 2, 3, 4, 4) + + +class TestEigvalsOpBatch2(TestEigvalsOp): + def set_input_dims(self): + self.input_dims = (3, 1, 4, 5, 5) + + +class TestEigvalsOpBatch3(TestEigvalsOp): + def set_input_dims(self): + self.input_dims = (6, 2, 9, 6, 6) + + +class TestEigvalsAPI(unittest.TestCase): + def setUp(self): + np.random.seed(0) + + self.small_dims = [6, 6] + self.large_dims = [128, 128] + 
self.batch_dims = [6, 9, 2, 2] + + self.set_dtype() + + self.input_dims = self.small_dims + self.set_input_data() + self.small_input = np.copy(self.input_data) + + self.input_dims = self.large_dims + self.set_input_data() + self.large_input = np.copy(self.input_data) + + self.input_dims = self.batch_dims + self.set_input_data() + self.batch_input = np.copy(self.input_data) + + def set_dtype(self): + self.dtype = np.float32 + + def set_input_data(self): + if (self.dtype == np.float32 or self.dtype == np.float64): + self.input_data = np.random.random(self.input_dims).astype( + self.dtype) + else: + self.input_data = ( + np.random.random(self.input_dims) + + np.random.random(self.input_dims) * 1j).astype(self.dtype) + + def verify_output(self, actural_outs, expect_outs): + actual_outs = np.array(actural_outs) + expect_outs = np.array(expect_outs) + self.assertTrue( + actual_outs.shape == expect_outs.shape, "Output shape has diff." + "\nExpect shape " + str(expect_outs.shape) + "\n" + "But Got" + + str(actual_outs.shape) + " in class " + self.__class__.__name__) + + n_dim = actual_outs.shape[-1] + for actual_row, expect_row in zip( + actual_outs.reshape((-1, n_dim)), + expect_outs.reshape((-1, n_dim))): + is_mapped_index = np.zeros((n_dim, )) + for i in range(n_dim): + is_mapped = False + for j in range(n_dim): + if is_mapped_index[j] == 0 and np.isclose( + np.array(actual_row[i]), + np.array(expect_row[j]), + atol=1e-5): + is_mapped_index[j] = True + is_mapped = True + break + self.assertTrue( + is_mapped, + "Output has diff in class " + self.__class__.__name__ + + "\nExpect " + str(expect_outs) + "\n" + "But Got" + + str(actual_outs) + "\nThe data " + str(actual_row[i]) + + " in " + str(actual_row) + " mismatch.") + + def run_dygraph(self, place): + paddle.disable_static() + paddle.set_device("cpu") + small_input_tensor = paddle.to_tensor(self.small_input, place=place) + large_input_tensor = paddle.to_tensor(self.large_input, place=place) + batch_input_tensor = paddle.to_tensor(self.batch_input, place=place) + + paddle_outs = paddle.linalg.eigvals(small_input_tensor, name='small_x') + np_outs = np_eigvals(self.small_input) + self.verify_output(paddle_outs, np_outs) + + paddle_outs = paddle.linalg.eigvals(large_input_tensor, name='large_x') + np_outs = np_eigvals(self.large_input) + self.verify_output(paddle_outs, np_outs) + + paddle_outs = paddle.linalg.eigvals(batch_input_tensor, name='small_x') + np_outs = np_eigvals(self.batch_input) + self.verify_output(paddle_outs, np_outs) + + def run_static(self, place): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + small_input_tensor = paddle.static.data( + name='small_x', shape=self.small_dims, dtype=self.dtype) + large_input_tensor = paddle.static.data( + name='large_x', shape=self.large_dims, dtype=self.dtype) + batch_input_tensor = paddle.static.data( + name='batch_x', shape=self.batch_dims, dtype=self.dtype) + + small_outs = paddle.linalg.eigvals( + small_input_tensor, name='small_x') + large_outs = paddle.linalg.eigvals( + large_input_tensor, name='large_x') + batch_outs = paddle.linalg.eigvals( + batch_input_tensor, name='batch_x') + + exe = paddle.static.Executor(place) + + paddle_outs = exe.run( + feed={ + "small_x": self.small_input, + "large_x": self.large_input, + "batch_x": self.batch_input + }, + fetch_list=[small_outs, large_outs, batch_outs]) + + np_outs = np_eigvals(self.small_input) + self.verify_output(paddle_outs[0], np_outs) + + np_outs = 
np_eigvals(self.large_input) + self.verify_output(paddle_outs[1], np_outs) + + np_outs = np_eigvals(self.batch_input) + self.verify_output(paddle_outs[2], np_outs) + + def test_cases(self): + places = [core.CPUPlace()] + #if core.is_compiled_with_cuda(): + # places.append(core.CUDAPlace(0)) + for place in places: + self.run_dygraph(place) + self.run_static(place) + + def test_error(self): + paddle.disable_static() + x = paddle.to_tensor([1]) + with self.assertRaises(BaseException): + paddle.linalg.eigvals(x) + + self.input_dims = [1, 2, 3, 4] + self.set_input_data() + x = paddle.to_tensor(self.input_data) + with self.assertRaises(BaseException): + paddle.linalg.eigvals(x) + + +class TestEigvalsAPIFloat64(TestEigvalsAPI): + def set_dtype(self): + self.dtype = np.float64 + + +class TestEigvalsAPIComplex64(TestEigvalsAPI): + def set_dtype(self): + self.dtype = np.complex64 + + +class TestEigvalsAPIComplex128(TestEigvalsAPI): + def set_dtype(self): + self.dtype = np.complex128 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_ema_fleet.py b/python/paddle/fluid/tests/unittests/test_ema_fleet.py new file mode 100644 index 0000000000000..e0526deb59af8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_ema_fleet.py @@ -0,0 +1,97 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
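The EMA test that follows checks ExponentialMovingAverage against a hand-rolled reference: ema_t = decay * ema_{t-1} + (1 - decay) * p_t, divided by (1 - decay**t) to correct for the zero initialization. The same reference as a standalone numpy helper (illustrative, not part of the patch):

    import numpy as np

    def debiased_ema(params, decay=0.999):
        ema = np.zeros_like(params[0])
        for p in params:
            ema = decay * ema + (1.0 - decay) * p
        # Bias correction for starting the running average at zero.
        return ema / (1.0 - decay ** len(params))

    steps = [np.full((10, ), v, dtype=np.float32) for v in (1.0, 2.0, 3.0)]
    print(debiased_ema(steps, decay=0.5))  # later steps carry the larger weight
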
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.utils as utils +import paddle.static as static + + +def gen_data(): + return np.random.random(size=(10, 5)).astype('float32') + + +class TestFleetStaticEMA(unittest.TestCase): + def setUp(self): + self._places = [paddle.CPUPlace()] + if paddle.device.is_compiled_with_cuda(): + self._places.append(paddle.CUDAPlace(0)) + self._ema_decay = 0.999 + self._param_name = "fc.weight" + self._train_program = static.Program() + self._startup_prog = static.Program() + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.without_graph_optimization = True + paddle.distributed.fleet.init(is_collective=True, strategy=strategy) + + with static.program_guard(self._train_program, self._startup_prog): + with utils.unique_name.guard(): + data = static.data(name='x', shape=[-1, 5], dtype='float32') + hidden = static.nn.fc(x=data, + size=10, + weight_attr=self._param_name) + cost = paddle.mean(hidden) + + self._test_program = static.default_main_program().clone( + for_test=True) + + optimizer = paddle.optimizer.Adam(learning_rate=0.001) + optimizer = paddle.distributed.fleet.distributed_optimizer( + optimizer, strategy) + optimizer.minimize(cost) + + self._ema = static.ExponentialMovingAverage(self._ema_decay) + self._ema.update() + + def train(self, place, restore): + exe = static.Executor(place) + exe.run(self._startup_prog) + + params = [] + for pass_id in range(2): + for batch_id in range(3): + exe.run(program=self._train_program, feed={'x': gen_data()}) + tmp_param = np.array(static.global_scope().find_var( + self._param_name).get_tensor()) + params.append(tmp_param) + + with self._ema.apply(exe, restore): + final_ema = np.array(static.global_scope().find_var( + self._param_name).get_tensor()) + exe.run(program=self._test_program, feed={'x': gen_data()}) + if not restore: + self._ema.restore(exe) + + return params, final_ema + + def test_check_ema(self): + for place in self._places: + for restore in (True, False): + params, final_ema = self.train(place, restore) + manu_ema = np.zeros_like(final_ema) + if len(params) > 0: + for param in params: + manu_ema = self._ema_decay * manu_ema + ( + 1 - self._ema_decay) * param + manu_ema = manu_ema / (1.0 - self._ema_decay**len(params)) + self.assertTrue(np.allclose(manu_ema, final_ema)) + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py index 6de2d2fb09228..db8689c14c30f 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py @@ -71,6 +71,8 @@ def test_opt_sharding_with_pp(self): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' ]) @@ -152,6 +154,8 @@ def test_opt_sharding_with_pp_with_allreduce_fuse(self): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 
'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' ]) @@ -212,7 +216,9 @@ def test_opt_sharding_with_pp_amp_gclip(self): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', - 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_sync_comm_stream' + 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' ]) self.assertEqual(main_prog_op_types, [ @@ -284,7 +290,9 @@ def test_opt_sharding_with_pp_amp_gclip_fuse_gm(self): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', - 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_sync_comm_stream' + 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' ]) self.assertEqual(main_prog_op_types, [ @@ -376,7 +384,7 @@ def test_opt_sharding_with_pp_amp_gclip_boundary(self): 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', - 'c_comm_init', 'c_sync_comm_stream' + 'c_comm_init', 'c_broadcast', 'c_sync_comm_stream' ]) self.assertEqual(main_prog_op_types, [ @@ -427,7 +435,7 @@ def test_opt_sharding_with_pp_amp_gclip_boundary_card1(self): 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_gen_nccl_id', 'c_comm_init', 'c_sync_comm_stream' + 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_sync_comm_stream' ]) self.assertEqual(main_prog_op_types, [ diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index 6b0a7b79c232c..61d98d32ec5fd 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -762,7 +762,9 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce(self): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_gen_nccl_id', 'c_comm_init', 'c_sync_comm_stream' + 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' ]) self.assertEqual(main_prog_op_types, [ @@ -928,7 +930,10 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_cast(self): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 
'c_sync_comm_stream' ]) self.assertEqual(main_prog_op_types, [ @@ -1023,7 +1028,11 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_offload(self): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_gen_nccl_id', 'c_comm_init', 'c_sync_comm_stream' + 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_sync_comm_stream' ]) self.assertEqual(main_prog_op_types, [ @@ -1121,7 +1130,10 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_cast_with_gradient_fuse( 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' ]) self.assertEqual(main_prog_op_types, [ @@ -1211,7 +1223,9 @@ def test_hybrid_with_pp_dp_amp_with_gradient_fuse(self): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_gen_nccl_id', 'c_comm_init', 'c_sync_comm_stream' + 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' ]) self.assertEqual(main_prog_op_types, [ diff --git a/python/paddle/fluid/tests/unittests/test_frame_op.py b/python/paddle/fluid/tests/unittests/test_frame_op.py new file mode 100644 index 0000000000000..f26662dcf4f26 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_frame_op.py @@ -0,0 +1,140 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
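# Illustrative sketch (assuming a 1-D input and axis=-1) of the framing
# arithmetic that the frame_from_librosa reference below relies on: a signal
# of length L is cut into n_frames = 1 + (L - frame_length) // hop_length
# overlapping frames via a strided view. The underscore-prefixed names are
# hypothetical and not part of this test file.
import numpy as np
from numpy.lib.stride_tricks import as_strided

_x = np.arange(10, dtype=np.float64)                      # L = 10
_frame_length, _hop_length = 4, 2
_n_frames = 1 + (len(_x) - _frame_length) // _hop_length  # -> 4 frames
_frames = as_strided(_x,
                     shape=(_frame_length, _n_frames),
                     strides=(_x.itemsize, _hop_length * _x.itemsize))
# _frames[:, 0] == [0, 1, 2, 3], _frames[:, 1] == [2, 3, 4, 5], ...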
+ +import numpy as np +from numpy.lib.stride_tricks import as_strided +import paddle +import unittest + +from op_test import OpTest + + +def frame_from_librosa(x, frame_length, hop_length, axis=-1): + if axis == -1 and not x.flags["C_CONTIGUOUS"]: + x = np.ascontiguousarray(x) + elif axis == 0 and not x.flags["F_CONTIGUOUS"]: + x = np.asfortranarray(x) + + n_frames = 1 + (x.shape[axis] - frame_length) // hop_length + strides = np.asarray(x.strides) + + if axis == -1: + shape = list(x.shape)[:-1] + [frame_length, n_frames] + strides = list(strides) + [hop_length * x.itemsize] + + elif axis == 0: + shape = [n_frames, frame_length] + list(x.shape)[1:] + strides = [hop_length * x.itemsize] + list(strides) + + else: + raise ValueError("Frame axis={} must be either 0 or -1".format(axis)) + + return as_strided(x, shape=shape, strides=strides) + + +class TestFrameOp(OpTest): + def setUp(self): + self.op_type = "frame" + self.shape, self.type, self.attrs = self.initTestCase() + self.inputs = { + 'X': np.random.random(size=self.shape).astype(self.type), + } + self.outputs = { + 'Out': frame_from_librosa( + x=self.inputs['X'], **self.attrs) + } + + def initTestCase(self): + input_shape = (150, ) + input_type = 'float64' + attrs = { + 'frame_length': 50, + 'hop_length': 15, + 'axis': -1, + } + return input_shape, input_type, attrs + + def test_check_output(self): + paddle.enable_static() + self.check_output() + paddle.disable_static() + + def test_check_grad_normal(self): + paddle.enable_static() + self.check_grad(['X'], 'Out') + paddle.disable_static() + + +class TestCase1(TestFrameOp): + def initTestCase(self): + input_shape = (150, ) + input_type = 'float64' + attrs = { + 'frame_length': 50, + 'hop_length': 15, + 'axis': 0, + } + return input_shape, input_type, attrs + + +class TestCase2(TestFrameOp): + def initTestCase(self): + input_shape = (8, 150) + input_type = 'float64' + attrs = { + 'frame_length': 50, + 'hop_length': 15, + 'axis': -1, + } + return input_shape, input_type, attrs + + +class TestCase3(TestFrameOp): + def initTestCase(self): + input_shape = (150, 8) + input_type = 'float64' + attrs = { + 'frame_length': 50, + 'hop_length': 15, + 'axis': 0, + } + return input_shape, input_type, attrs + + +class TestCase4(TestFrameOp): + def initTestCase(self): + input_shape = (4, 2, 150) + input_type = 'float64' + attrs = { + 'frame_length': 50, + 'hop_length': 15, + 'axis': -1, + } + return input_shape, input_type, attrs + + +class TestCase5(TestFrameOp): + def initTestCase(self): + input_shape = (150, 4, 2) + input_type = 'float64' + attrs = { + 'frame_length': 50, + 'hop_length': 15, + 'axis': 0, + } + return input_shape, input_type, attrs + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index 330c4c5ffec3d..5f1f4a4641168 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -18,6 +18,8 @@ import numpy as np import six from test_imperative_resnet import ResNet, BottleneckBlock, ConvBNLayer, train_parameters, optimizer_setting +import paddle.nn as nn +from paddle.static import InputSpec if fluid.core.is_compiled_with_cuda(): fluid.set_flags({"FLAGS_cudnn_deterministic": True}) @@ -89,6 +91,21 @@ def test_custom_op_list(self): set(black_list) == (set(base_black_list) - {"log"}) | {"conv2d"}) + base_white_list = 
fluid.dygraph.amp.auto_cast.PURE_FP16_WHITE_LIST + base_black_list = fluid.dygraph.amp.auto_cast.PURE_FP16_BLACK_LIST + with fluid.dygraph.amp_guard( + custom_white_list=["log"], + custom_black_list=["conv2d"], + level='O2'): + white_list, black_list = tracer._get_amp_op_list() + self.assertTrue( + set(white_list) == + (set(base_white_list) | {"log"}) - {"conv2d"}) + + self.assertTrue( + set(black_list) == + (set(base_black_list) - {"log"}) | {"conv2d"}) + def test_custom_op_list_exception(self): inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32) @@ -115,13 +132,36 @@ def test_amp_guard_upsupported_fp16_op(self): conv2d = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) data = fluid.dygraph.to_variable(data) with fluid.dygraph.amp_guard(True): - out_fp16 = conv2d(data) - out_fp32 = paddle.expand_as( - out_fp16, out_fp16) # expand_as_v2 has no fp16 kernel + out_amp_fp16 = conv2d(data) + out_amp_fp32 = paddle.expand_as( + out_amp_fp16, + out_amp_fp16) # expand_as_v2 has no fp16 kernel + + with fluid.dygraph.amp_guard(True, level='O2'): + out_purefp16_fp16 = conv2d(data) + out_purefp16_fp32 = paddle.expand_as( + out_purefp16_fp16, + out_purefp16_fp16) # expand_as_v2 has no fp16 kernel self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32) - self.assertTrue(out_fp16.dtype == fluid.core.VarDesc.VarType.FP16) - self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32) + self.assertTrue(out_amp_fp16.dtype == fluid.core.VarDesc.VarType.FP16) + self.assertTrue(out_amp_fp32.dtype == fluid.core.VarDesc.VarType.FP32) + self.assertTrue( + out_purefp16_fp16.dtype == fluid.core.VarDesc.VarType.FP16) + self.assertTrue( + out_purefp16_fp32.dtype == fluid.core.VarDesc.VarType.FP32) + + def test_mode_exception(self): + def func(): + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + conv2d = fluid.dygraph.Conv2D( + 3, 2, 3, bias_attr=False, act=None) + data = fluid.dygraph.to_variable(data) + with fluid.dygraph.amp_guard(level='O'): + out = conv2d(data) + + self.assertRaises(ValueError, func) class TestAmpScaler(unittest.TestCase): @@ -182,6 +222,47 @@ def run_simple_conv(inp_np, use_scaler=True): np.allclose(outs_with_scaler[1][i][0].numpy(), outs_no_scaler[1][i][0].numpy()), True) + def test_step(self): + inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32) + + def run_simple_conv(inp_np, use_scaler=True): + paddle.seed(10) + paddle.framework.random._manual_program_seed(10) + with fluid.dygraph.guard(): + model = SimpleConv( + num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = fluid.dygraph.to_variable(inp_np) + + out = model(data) + loss = fluid.layers.mean(out) + if use_scaler: + print('use scaler') + scaled_loss = scaler.scale(loss) + scaled_loss.backward() + scaler.step(optimizer) + scaler.update() + else: + print('use no scaler') + loss.backward() + optimizer.step() + return optimizer._parameter_list + + outs_with_scaler = run_simple_conv(inp_np, use_scaler=True) + outs_no_scaler = run_simple_conv(inp_np, use_scaler=False) + + for i in range(len(outs_with_scaler)): + # check each parameter + self.assertEqual( + np.allclose(outs_with_scaler[i].numpy(), + outs_no_scaler[i].numpy()), True) + def test_nan_inf(self): inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32) inp_np[0][1][2][3] = np.nan @@ -212,6 
+293,52 @@ def test_nan_inf(self): self.assertTrue( np.array_equal(param.numpy(), params_init[param.name])) + def test_step_update_exception(self): + def func1(): + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + conv = model(data) + loss = paddle.mean(conv) + scaled = scaler.scale(loss) + scaled.backward() + scaler.unscale_(optimizer) + scaler.unscale_(optimizer) + + self.assertRaises(RuntimeError, func1) + + def func2(): + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + conv = model(data) + loss = paddle.mean(conv) + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + scaler.unscale_(optimizer) + + self.assertRaises(RuntimeError, func2) + + def func3(): + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + conv = model(data) + loss = paddle.mean(conv) + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + scaler.step(optimizer) + + self.assertRaises(RuntimeError, func3) + def test_get_and_set(self): with fluid.dygraph.guard(): scaler = paddle.amp.GradScaler( @@ -386,6 +513,315 @@ def test_with_state_dict(self): np.allclose(out_use_state_dict[0], out_no_state_dict[0])) +class TestAmpDecorator(unittest.TestCase): + def test_mode_exception(self): + def func(): + with fluid.dygraph.guard(): + model = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) + opt = paddle.optimizer.SGD(parameters=model.parameters()) + model, opt = paddle.amp.decorate( + models=model, optimizers=opt, level='O') + + self.assertRaises(ValueError, func) + + def test_input_formate_exception(self): + def test_model_error(): + with fluid.dygraph.guard(): + model = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) + opt = paddle.optimizer.SGD(parameters=model.parameters()) + paddle.amp.decorate(models=None, optimizers=opt, level='O2') + + self.assertRaises(TypeError, test_model_error) + + def test_optimizer_error(): + with fluid.dygraph.guard(): + model = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) + paddle.amp.decorate(models=model, optimizers=None, level='O2') + + self.assertRaises(TypeError, test_optimizer_error) + + def test_input_type_exception(self): + def test_error_model_optimizer(): + class MyModel(object): + def __init__(self): + print("A fake Model") + + class MyOptimizer(object): + def __init__(self): + print("A fake Optimizer") + + model = MyModel() + opt = MyOptimizer() + with fluid.dygraph.guard(): + paddle.amp.decorate(models=model, optimizers=opt, level='O2') + + self.assertRaises(TypeError, test_error_model_optimizer) + + def test_set_master_weight(self): + model1 = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) + opt1 = paddle.optimizer.Adam( + learning_rate=0.0001, + parameters=model1.parameters(), + multi_precision=True) + model1, opt1 = paddle.amp.decorate( + models=model1, optimizers=opt1, level='O2', master_weight=None) + self.assertEqual(opt1._multi_precision, True) + + model2 = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) + opt2 = paddle.optimizer.Adam( + 
learning_rate=0.0001, + parameters=model2.parameters(), + multi_precision=False) + model2, opt2 = paddle.amp.decorate( + models=model2, optimizers=opt2, level='O2', master_weight=None) + self.assertEqual(opt2._multi_precision, True) + + model3 = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) + opt3 = paddle.optimizer.Adam( + learning_rate=0.0001, parameters=model3.parameters()) + model3, opt3 = paddle.amp.decorate( + models=model3, optimizers=opt3, level='O2', master_weight=True) + self.assertEqual(opt3._multi_precision, True) + + model4 = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) + opt4 = paddle.optimizer.Adam( + learning_rate=0.0001, parameters=model4.parameters()) + model4, opt4 = paddle.amp.decorate( + models=model4, optimizers=opt4, level='O2', master_weight=False) + self.assertEqual(opt4._multi_precision, False) + + +class TestPureFp16SaveLoad(unittest.TestCase): + def test_save_dtype_exception(self): + def func(): + paddle.disable_static() + model = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) + opt = paddle.optimizer.SGD(parameters=model.parameters()) + paddle.amp.decorate( + models=model, optimizers=opt, level='O2', save_dtype='int') + + self.assertRaises(ValueError, func) + + def train_resnet(self, + enable_amp=True, + use_data_loader=True, + use_save_load=True): + seed = 90 + + batch_size = train_parameters["batch_size"] + batch_num = 4 + + paddle.seed(seed) + paddle.framework.random._manual_program_seed(seed) + + resnet = ResNet(use_cudnn=True) + optimizer = optimizer_setting( + train_parameters, parameter_list=resnet.parameters()) + np.random.seed(seed) + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), batch_size=batch_size) + + dy_param_init_value = {} + for param in resnet.parameters(): + dy_param_init_value[param.name] = param.numpy() + + program = None + scaler = paddle.amp.GradScaler( + enable=enable_amp, init_loss_scaling=2.**10) + + if use_data_loader: + train_reader = paddle.batch( + reader_decorator(paddle.dataset.flowers.train(use_xmap=False)), + batch_size=batch_size, + drop_last=True) + train_loader = fluid.io.DataLoader.from_generator( + capacity=4, + use_double_buffer=True, + iterable=True, + return_list=True) + train_loader.set_sample_list_generator(train_reader) + train_reader = train_loader + + if enable_amp: + resnet, optimizer = paddle.amp.decorate( + models=resnet, + optimizers=optimizer, + level='O2', + save_dtype='float32') + + for batch_id, data in enumerate(train_reader()): + if batch_id >= batch_num: + break + if use_data_loader: + img, label = data + else: + dy_x_data = np.array([x[0].reshape(3, 224, 224) + for x in data]).astype('float32') + if len(np.array([x[1] + for x in data]).astype('int64')) != batch_size: + continue + y_data = np.array( + [x[1] for x in data]).astype('int64').reshape(-1, 1) + + img = paddle.to_tensor(dy_x_data) + label = paddle.to_tensor(y_data) + label.stop_gradient = True + + with paddle.amp.auto_cast(enable=enable_amp, level='O2'): + out = resnet(img) + + loss = paddle.nn.functional.cross_entropy(input=out, label=label) + loss = paddle.cast(loss, 'float32') + avg_loss = paddle.mean(x=loss) + + dy_out = avg_loss.numpy() + + scaled_loss = scaler.scale(avg_loss) + scaled_loss.backward() + + scaler.minimize(optimizer, scaled_loss) + + dy_grad_value = {} + for param in resnet.parameters(): + if param.trainable: + np_array = np.array(param._grad_ivar().value().get_tensor()) + dy_grad_value[param.name + fluid.core.grad_var_suffix( + )] = np_array + + 
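            # Descriptive note: the gradients collected above are cleared
            # before the next iteration; the parameter values are snapshotted
            # below, and at batch_id == 2 the model / optimizer / GradScaler
            # state is round-tripped through paddle.save and paddle.load to
            # exercise pure-fp16 save/load.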
resnet.clear_gradients() + + dy_param_value = {} + for param in resnet.parameters(): + dy_param_value[param.name] = param.numpy() + + if use_save_load and batch_id == 2: + # paddle.save + obj = { + 'model': resnet.state_dict(), + 'opt': optimizer.state_dict(), + 'scaler': scaler.state_dict() + } + path = 'model.pdparams' + paddle.save(obj, path) + # paddle.load + obj_load = paddle.load(path) + resnet = ResNet(use_cudnn=True) + optimizer = optimizer_setting( + train_parameters, parameter_list=resnet.parameters()) + resnet.set_state_dict(obj_load['model']) + optimizer.set_state_dict(obj_load['opt']) + scaler.load_state_dict(obj_load['scaler']) + resnet, optimizer = paddle.amp.decorate( + models=resnet, + optimizers=optimizer, + level='O2', + save_dtype='float32') + + if use_data_loader: + train_reader._reset() + return dy_out, dy_param_value, dy_grad_value + + def test_with_save_load(self): + with fluid.dygraph.guard(): + out_use_save_load = self.train_resnet( + enable_amp=True, use_data_loader=True, use_save_load=True) + out_no_save_load = self.train_resnet( + enable_amp=True, use_data_loader=True, use_save_load=False) + print('save_load:', out_use_save_load[0], out_no_save_load[0]) + self.assertTrue(np.allclose(out_use_save_load[0], out_no_save_load[0])) + + +class TestPureFp16InferenceSaveLoad(unittest.TestCase): + def test_inference_save_load(self): + BATCH_SIZE = 16 + BATCH_NUM = 4 + EPOCH_NUM = 4 + IMAGE_SIZE = 784 + CLASS_NUM = 10 + + # define a random dataset + class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples + + def __getitem__(self, idx): + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, + (1, )).astype('int64') + return image, label + + def __len__(self): + return self.num_samples + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) + + def forward(self, x): + return self._linear(x) + + def train(layer, loader, loss_fn, opt): + for epoch_id in range(EPOCH_NUM): + for batch_id, (image, label) in enumerate(loader()): + with paddle.amp.auto_cast( + enable=True, + custom_white_list=None, + custom_black_list=None, + level='O2'): + out = layer(image) + loss = loss_fn(out, label) + loss.backward() + opt.step() + opt.clear_grad() + + # train + layer = LinearNet() + adam = paddle.optimizer.Adam( + learning_rate=0.001, + parameters=layer.parameters(), + multi_precision=True) + loss_fn = nn.CrossEntropyLoss() + layer, adam = paddle.amp.decorate( + models=layer, optimizers=adam, save_dtype='float32') + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + loader = paddle.io.DataLoader( + dataset, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) + + train(layer, loader, loss_fn, adam) + + # save + path = "example_model/linear" + paddle.jit.save( + layer, path, input_spec=[InputSpec( + shape=[IMAGE_SIZE], name='x')]) + + # jit.load + loaded_layer = paddle.jit.load(path) + + # inference + loaded_layer.eval() + x = np.random.randn(1, IMAGE_SIZE).astype('float32') + x_tensor = paddle.to_tensor(x) + pred = loaded_layer(x_tensor) + + # load_inference_model + paddle.enable_static() + exe = paddle.static.Executor(paddle.CPUPlace()) + [inference_program, feed_target_names, fetch_targets] = ( + paddle.static.load_inference_model(path, exe)) + tensor_img = x + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + + 
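        # Descriptive note: the dygraph prediction from the jit-loaded layer
        # and the result of load_inference_model come from the same saved
        # program and the same input, so they are compared within atol=1e-5
        # below.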
self.assertTrue(np.allclose(pred.numpy(), results, atol=1.e-5)) + + class TestResnet2(unittest.TestCase): """ Use paddle-2.0 API @@ -393,6 +829,7 @@ class TestResnet2(unittest.TestCase): def train_resnet(self, enable_amp=True, + level='O1', use_data_loader=False, use_param_group=False): seed = 90 @@ -418,13 +855,15 @@ def train_resnet(self, # NOTE(zhiqiu): The Membership test operations(in / not in) calls "is" and "equal", # see details: https://docs.python.org/3/reference/expressions.html#membership-test-operations. # So do not use other_params = [p for p in resnet.parameters() if p not in conv_params] - optimizer = paddle.optimizer.Momentum(parameters=[{ - 'params': conv_params, - 'learning_rate': 0.01 - }, { - 'params': other_params, - 'learning_rate': 0.001 - }]) + optimizer = paddle.optimizer.Momentum( + parameters=[{ + 'params': conv_params, + 'learning_rate': 0.01 + }, { + 'params': other_params, + 'learning_rate': 0.001 + }], + multi_precision=True) else: optimizer = paddle.optimizer.SGD(parameters=resnet.parameters()) @@ -453,6 +892,10 @@ def train_resnet(self, train_loader.set_sample_list_generator(train_reader) train_reader = train_loader + if enable_amp and (level == 'O2'): + resnet, optimizer = paddle.amp.decorate( + models=resnet, optimizers=optimizer, level='O2') + for batch_id, data in enumerate(train_reader()): if batch_id >= batch_num: break @@ -471,18 +914,20 @@ def train_resnet(self, label = paddle.to_tensor(y_data) label.stop_gradient = True - with paddle.amp.auto_cast(enable=enable_amp): + with paddle.amp.auto_cast(enable=enable_amp, level=level): out = resnet(img) loss = paddle.nn.functional.cross_entropy(input=out, label=label) + loss = paddle.cast(loss, 'float32') avg_loss = paddle.mean(x=loss) dy_out = avg_loss.numpy() scaled_loss = scaler.scale(avg_loss) scaled_loss.backward() - + scaler.unscale_(optimizer) scaler.step(optimizer) + scaler.update() dy_grad_value = {} for param in resnet.parameters(): @@ -504,15 +949,20 @@ def test_resnet(self): with fluid.dygraph.guard(): out_fp32 = self.train_resnet(enable_amp=False) out_amp = self.train_resnet(enable_amp=True) - print(out_fp32[0], out_amp[0]) + out_pure_fp16 = self.train_resnet(enable_amp=True, level='O2') + print(out_fp32[0], out_amp[0], out_pure_fp16[0]) self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5)) + self.assertTrue(np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2)) def test_with_data_loader(self): with fluid.dygraph.guard(): out_fp32 = self.train_resnet(enable_amp=False, use_data_loader=True) out_amp = self.train_resnet(enable_amp=True, use_data_loader=True) - print(out_fp32[0], out_amp[0]) + out_pure_fp16 = self.train_resnet( + enable_amp=True, use_data_loader=True, level='O2') + print(out_fp32[0], out_amp[0], out_pure_fp16[0]) self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5)) + self.assertTrue(np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2)) def test_param_group(self): with fluid.dygraph.guard(): @@ -520,8 +970,14 @@ def test_param_group(self): enable_amp=False, use_data_loader=True, use_param_group=True) out_amp = self.train_resnet( enable_amp=True, use_data_loader=True, use_param_group=True) - print(out_fp32[0], out_amp[0]) + out_pure_fp16 = self.train_resnet( + enable_amp=True, + use_data_loader=True, + use_param_group=True, + level='O2') + print(out_fp32[0], out_amp[0], out_pure_fp16[0]) self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5)) + self.assertTrue(np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2)) class 
TestResnet(unittest.TestCase): @@ -529,7 +985,7 @@ class TestResnet(unittest.TestCase): Use paddle-1.x API """ - def train_resnet(self, enable_amp=True): + def train_resnet(self, enable_amp=True, level='O1'): seed = 90 batch_size = train_parameters["batch_size"] @@ -542,6 +998,8 @@ def train_resnet(self, enable_amp=True): resnet = ResNet(use_cudnn=True) optimizer = optimizer_setting( train_parameters, parameter_list=resnet.parameters()) + optimizer = paddle.optimizer.Momentum( + parameters=resnet.parameters(), multi_precision=True) np.random.seed(seed) train_reader = paddle.batch( paddle.dataset.flowers.train(use_xmap=False), @@ -554,6 +1012,11 @@ def train_resnet(self, enable_amp=True): program = None scaler = paddle.fluid.dygraph.AmpScaler( enable=enable_amp, init_loss_scaling=2.**10) + + if enable_amp and (level == 'O2'): + resnet, optimizer = paddle.fluid.dygraph.amp_decorate( + models=resnet, optimizers=optimizer, level='O2') + for batch_id, data in enumerate(train_reader()): if batch_id >= batch_num: break @@ -567,7 +1030,8 @@ def train_resnet(self, enable_amp=True): img = fluid.dygraph.to_variable(dy_x_data) label = fluid.dygraph.to_variable(y_data) label.stop_gradient = True - with paddle.fluid.dygraph.amp_guard(enable=enable_amp): + with paddle.fluid.dygraph.amp_guard( + enable=enable_amp, level=level): out = resnet(img) loss = fluid.layers.cross_entropy(input=out, label=label) @@ -599,8 +1063,10 @@ def train_resnet(self, enable_amp=True): def test_resnet(self): out_fp32 = self.train_resnet(enable_amp=False) out_amp = self.train_resnet(enable_amp=True) - print(out_fp32[0], out_amp[0]) + out_pure_fp16 = self.train_resnet(enable_amp=True, level='O2') + print(out_fp32[0], out_amp[0], out_pure_fp16[0]) self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-2)) + self.assertTrue(np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-1)) class TestLayerNormFp16(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index 1d24687a6b199..fc58f979b4dc2 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -1099,7 +1099,6 @@ def test_save_load_finetune_load(self): paddle.static.InputSpec( shape=[None, IMAGE_SIZE], dtype='float32') ]) - result_00 = layer_save(inps0) result_01 = layer_save(inps1) #load and save without running diff --git a/python/paddle/fluid/tests/unittests/test_linalg_pinv_op.py b/python/paddle/fluid/tests/unittests/test_linalg_pinv_op.py new file mode 100644 index 0000000000000..7ff6ebc0437b4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_linalg_pinv_op.py @@ -0,0 +1,251 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
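# Illustrative sketch of the NumPy reference behaviour exercised by the tests
# below (the underscore-prefixed names are hypothetical): np.linalg.pinv
# returns the Moore-Penrose pseudo-inverse, so A @ pinv(A) @ A recovers A even
# for non-square input.
import numpy as np

_A = np.random.random((4, 5)).astype('float64')
_A_pinv = np.linalg.pinv(_A, rcond=1e-15)   # same rcond the tests default to
assert np.allclose(_A @ _A_pinv @ _A, _A, atol=1e-6)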
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.fluid.core as core +from op_test import OpTest, skip_check_grad_ci +from gradient_checker import grad_check +from decorator_helper import prog_scope + + +class LinalgPinvTestCase(unittest.TestCase): + def setUp(self): + self.init_config() + self.generate_input() + self.generate_output() + self.places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(paddle.CUDAPlace(0)) + + def generate_input(self): + self._input_shape = (5, 5) + self._input_data = np.random.random(self._input_shape).astype( + self.dtype) + + def generate_output(self): + self._output_data = np.linalg.pinv(self._input_data, \ + rcond=self.rcond, hermitian=self.hermitian) + + def init_config(self): + self.dtype = 'float64' + self.rcond = 1e-15 + self.hermitian = False + + def test_dygraph(self): + for place in self.places: + paddle.disable_static(place) + x = paddle.to_tensor(self._input_data, place=place) + out = paddle.linalg.pinv( + x, rcond=self.rcond, hermitian=self.hermitian).numpy() + if (np.abs(out - self._output_data) < 1e-6).any(): + pass + else: + print("EXPECTED: \n", self._output_data) + print("GOT : \n", out) + raise RuntimeError("Check PINV dygraph Failed") + + def test_static(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for place in places: + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = paddle.fluid.data( + name="input", + shape=self._input_shape, + dtype=self._input_data.dtype) + out = paddle.linalg.pinv( + x, rcond=self.rcond, hermitian=self.hermitian) + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": self._input_data}, + fetch_list=[out]) + if (np.abs(fetches[0] - self._output_data) < 1e-6).any(): + pass + else: + print("EXPECTED: \n", self._output_data) + print("GOT : \n", fetches[0]) + raise RuntimeError("Check PINV static Failed") + + def test_grad(self): + for place in self.places: + x = paddle.to_tensor( + self._input_data, place=place, stop_gradient=False) + out = paddle.linalg.pinv( + x, rcond=self.rcond, hermitian=self.hermitian) + try: + out.backward() + x_grad = x.grad + # print(x_grad) + except: + raise RuntimeError("Check PINV Grad Failed") + + +class LinalgPinvTestCase1(LinalgPinvTestCase): + def generate_input(self): + self._input_shape = (4, 5) + self._input_data = np.random.random(self._input_shape).astype( + self.dtype) + + +class LinalgPinvTestCase2(LinalgPinvTestCase): + def generate_input(self): + self._input_shape = (5, 4) + self._input_data = np.random.random(self._input_shape).astype( + self.dtype) + + +class LinalgPinvTestCaseBatch1(LinalgPinvTestCase): + def generate_input(self): + self._input_shape = (3, 5, 5) + self._input_data = np.random.random(self._input_shape).astype( + self.dtype) + + +class LinalgPinvTestCaseBatch2(LinalgPinvTestCase): + def generate_input(self): + self._input_shape = (3, 4, 5) + self._input_data = np.random.random(self._input_shape).astype( + self.dtype) + + +class LinalgPinvTestCaseBatch3(LinalgPinvTestCase): + def generate_input(self): + self._input_shape = (3, 5, 4) + self._input_data = np.random.random(self._input_shape).astype( + self.dtype) + + +class LinalgPinvTestCaseBatch4(LinalgPinvTestCase): + def generate_input(self): + self._input_shape = (3, 6, 5, 4) + self._input_data = 
np.random.random(self._input_shape).astype( + self.dtype) + + +class LinalgPinvTestCaseBatchBig(LinalgPinvTestCase): + def generate_input(self): + self._input_shape = (2, 200, 300) + self._input_data = np.random.random(self._input_shape).astype( + self.dtype) + + +class LinalgPinvTestCaseFP32(LinalgPinvTestCase): + def generate_input(self): + self._input_shape = (3, 5, 5) + self._input_data = np.random.random(self._input_shape).astype( + self.dtype) + + def init_config(self): + self.dtype = 'float32' + self.rcond = 1e-15 + self.hermitian = False + + +class LinalgPinvTestCaseRcond(LinalgPinvTestCase): + def generate_input(self): + self._input_shape = (3, 5, 5) + self._input_data = np.random.random(self._input_shape).astype( + self.dtype) + + def init_config(self): + self.dtype = 'float64' + self.rcond = 1e-10 + self.hermitian = False + + +class LinalgPinvTestCaseHermitian1(LinalgPinvTestCase): + def generate_input(self): + self._input_shape = (5, 5) + x = np.random.random(self._input_shape).astype(self.dtype) + \ + 1J * np.random.random(self._input_shape).astype(self.dtype) + self._input_data = x + x.transpose().conj() + + def init_config(self): + self.dtype = 'float64' + self.rcond = 1e-15 + self.hermitian = True + + +class LinalgPinvTestCaseHermitian2(LinalgPinvTestCase): + def generate_input(self): + self._input_shape = (3, 5, 5) + x = np.random.random(self._input_shape).astype(self.dtype) + \ + 1J * np.random.random(self._input_shape).astype(self.dtype) + self._input_data = x + x.transpose((0, 2, 1)).conj() + + def init_config(self): + self.dtype = 'float64' + self.rcond = 1e-15 + self.hermitian = True + + +class LinalgPinvTestCaseHermitian3(LinalgPinvTestCase): + def generate_input(self): + self._input_shape = (3, 5, 5) + x = np.random.random(self._input_shape).astype(self.dtype) + \ + 1J * np.random.random(self._input_shape).astype(self.dtype) + self._input_data = x + x.transpose((0, 2, 1)).conj() + + def init_config(self): + self.dtype = 'float32' + self.rcond = 1e-15 + self.hermitian = True + + +class LinalgPinvTestCaseHermitian4(LinalgPinvTestCase): + def generate_input(self): + self._input_shape = (5, 5) + x = np.random.random(self._input_shape).astype(self.dtype) + self._input_data = x + x.transpose() + + def init_config(self): + self.dtype = 'float64' + self.rcond = 1e-15 + self.hermitian = True + + +class LinalgPinvTestCaseHermitian5(LinalgPinvTestCase): + def generate_input(self): + self._input_shape = (3, 5, 5) + x = np.random.random(self._input_shape).astype(self.dtype) + self._input_data = x + x.transpose((0, 2, 1)) + + def init_config(self): + self.dtype = 'float64' + self.rcond = 1e-15 + self.hermitian = True + + +class LinalgPinvTestCaseHermitianFP32(LinalgPinvTestCase): + def generate_input(self): + self._input_shape = (3, 5, 5) + x = np.random.random(self._input_shape).astype(self.dtype) + self._input_data = x + x.transpose((0, 2, 1)) + + def init_config(self): + self.dtype = 'float32' + self.rcond = 1e-15 + self.hermitian = True + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_linear.py b/python/paddle/fluid/tests/unittests/test_linear.py index 59f38d7cad691..9d07a80da15db 100644 --- a/python/paddle/fluid/tests/unittests/test_linear.py +++ b/python/paddle/fluid/tests/unittests/test_linear.py @@ -73,15 +73,6 @@ def test_error(self, place=paddle.CPUPlace()): np.testing.assert_array_almost_equal(res_f, res_nn) np.testing.assert_array_almost_equal(res_nn, res_np) - def test_error_dummy_input(self, 
place=paddle.CPUPlace()): - with self.assertRaises(RuntimeError): - x_arr = np.array([], dtype=np.float32) - x = paddle.to_tensor( - np.reshape(x_arr, (0, 4, 4, 4)), dtype='float32') - weight = paddle.zeros([4, 4, 4], dtype='float32') - bias = paddle.to_tensor([], dtype='float32') - paddle.nn.functional.linear(x, weight, bias=bias) - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py index 97047b1ae0e5e..8856624b4efc7 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py +++ b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py @@ -198,32 +198,34 @@ def test_errors(self): paddle.static.Program()): # The inputs type of multi_dot must be list matrix. input1 = 12 - self.assertRaises(TypeError, paddle.multi_dot, [input1, input1]) + self.assertRaises(TypeError, paddle.linalg.multi_dot, + [input1, input1]) # The inputs dtype of multi_dot must be float64, float64 or float16. input2 = paddle.static.data( name='input2', shape=[10, 10], dtype="int32") - self.assertRaises(TypeError, paddle.multi_dot, [input2, input2]) + self.assertRaises(TypeError, paddle.linalg.multi_dot, + [input2, input2]) # the number of tensor must be larger than 1 x0 = paddle.static.data(name='x0', shape=[3, 2], dtype="float64") - self.assertRaises(ValueError, paddle.multi_dot, [x0]) + self.assertRaises(ValueError, paddle.linalg.multi_dot, [x0]) #the first tensor must be 1D or 2D x1 = paddle.static.data(name='x1', shape=[3, 2, 3], dtype="float64") x2 = paddle.static.data(name='x2', shape=[3, 2], dtype="float64") - self.assertRaises(ValueError, paddle.multi_dot, [x1, x2]) + self.assertRaises(ValueError, paddle.linalg.multi_dot, [x1, x2]) #the last tensor must be 1D or 2D x3 = paddle.static.data(name='x3', shape=[3, 2], dtype="float64") x4 = paddle.static.data(name='x4', shape=[3, 2, 2], dtype="float64") - self.assertRaises(ValueError, paddle.multi_dot, [x3, x4]) + self.assertRaises(ValueError, paddle.linalg.multi_dot, [x3, x4]) #the tensor must be 2D, except first and last tensor x5 = paddle.static.data(name='x5', shape=[3, 2], dtype="float64") x6 = paddle.static.data(name='x6', shape=[2], dtype="float64") x7 = paddle.static.data(name='x7', shape=[2, 2], dtype="float64") - self.assertRaises(ValueError, paddle.multi_dot, [x5, x6, x7]) + self.assertRaises(ValueError, paddle.linalg.multi_dot, [x5, x6, x7]) class APITestMultiDot(unittest.TestCase): @@ -232,7 +234,7 @@ def test_out(self): with paddle.static.program_guard(paddle.static.Program()): x0 = paddle.static.data(name='x0', shape=[3, 2], dtype="float64") x1 = paddle.static.data(name='x1', shape=[2, 3], dtype='float64') - result = paddle.multi_dot([x0, x1]) + result = paddle.linalg.multi_dot([x0, x1]) exe = paddle.static.Executor(paddle.CPUPlace()) data1 = np.random.rand(3, 2).astype("float64") data2 = np.random.rand(2, 3).astype("float64") @@ -254,7 +256,7 @@ def test_dygraph_without_out(self): input_array2 = np.random.rand(4, 3).astype("float64") data1 = paddle.to_tensor(input_array1) data2 = paddle.to_tensor(input_array2) - out = paddle.multi_dot([data1, data2]) + out = paddle.linalg.multi_dot([data1, data2]) expected_result = np.linalg.multi_dot([input_array1, input_array2]) self.assertTrue(np.allclose(expected_result, out.numpy())) diff --git a/python/paddle/fluid/tests/unittests/test_norm_op.py b/python/paddle/fluid/tests/unittests/test_norm_op.py index af81d15717a70..6b424e03cc243 100644 --- 
a/python/paddle/fluid/tests/unittests/test_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_norm_op.py @@ -41,10 +41,10 @@ def setUp(self): self.outputs = {'Out': y, 'Norm': norm} def test_check_output(self): - self.check_output(atol=1e-5) + self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Out', max_relative_error=0.008) + self.check_grad(['X'], 'Out') def init_test_case(self): self.shape = [2, 3, 4, 5] @@ -97,6 +97,9 @@ class TestNormOp6(TestNormOp): def init_dtype(self): self.dtype = "float32" + def test_check_grad(self): + self.check_grad(['X'], 'Out', max_relative_error=0.008) + @unittest.skipIf(not fluid.core.is_compiled_with_cuda(), "core is not compiled with CUDA") diff --git a/python/paddle/fluid/tests/unittests/test_overlap_add_op.py b/python/paddle/fluid/tests/unittests/test_overlap_add_op.py new file mode 100644 index 0000000000000..7af67d01b573e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_overlap_add_op.py @@ -0,0 +1,157 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import unittest + +from op_test import OpTest + + +def overlap_add(x, hop_length, axis=-1): + assert axis in [0, -1], 'axis should be 0/-1.' + assert len(x.shape) >= 2, 'Input dims shoulb be >= 2.' + + squeeze_output = False + if len(x.shape) == 2: + squeeze_output = True + dim = 0 if axis == -1 else -1 + x = np.expand_dims(x, dim) # batch + + n_frames = x.shape[axis] + frame_length = x.shape[1] if axis == 0 else x.shape[-2] + + # Assure no gaps between frames. + assert 0 < hop_length <= frame_length, \ + f'hop_length should be in (0, frame_length({frame_length})], but got {hop_length}.' 
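    # With n_frames overlapping frames of length frame_length placed
    # hop_length samples apart, the reconstructed signal spans
    # (n_frames - 1) * hop_length + frame_length samples; e.g. 3 frames of
    # length 8 with hop 4 give 2 * 4 + 8 = 16 samples.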
+ + seq_length = (n_frames - 1) * hop_length + frame_length + + reshape_output = False + if len(x.shape) > 3: + reshape_output = True + if axis == 0: + target_shape = [seq_length] + list(x.shape[2:]) + x = x.reshape(n_frames, frame_length, np.product(x.shape[2:])) + else: + target_shape = list(x.shape[:-2]) + [seq_length] + x = x.reshape(np.product(x.shape[:-2]), frame_length, n_frames) + + if axis == 0: + x = x.transpose((2, 1, 0)) + + y = np.zeros(shape=[np.product(x.shape[:-2]), seq_length], dtype=x.dtype) + for i in range(x.shape[0]): + for frame in range(x.shape[-1]): + sample = frame * hop_length + y[i, sample:sample + frame_length] += x[i, :, frame] + + if axis == 0: + y = y.transpose((1, 0)) + + if reshape_output: + y = y.reshape(target_shape) + + if squeeze_output: + y = y.squeeze(-1) if axis == 0 else y.squeeze(0) + + return y + + +class TestOverlapAddOp(OpTest): + def setUp(self): + self.op_type = "overlap_add" + self.shape, self.type, self.attrs = self.initTestCase() + self.inputs = { + 'X': np.random.random(size=self.shape).astype(self.type), + } + self.outputs = {'Out': overlap_add(x=self.inputs['X'], **self.attrs)} + + def initTestCase(self): + input_shape = (50, 3) + input_type = 'float64' + attrs = { + 'hop_length': 4, + 'axis': -1, + } + return input_shape, input_type, attrs + + def test_check_output(self): + paddle.enable_static() + self.check_output() + paddle.disable_static() + + def test_check_grad_normal(self): + paddle.enable_static() + self.check_grad(['X'], 'Out') + paddle.disable_static() + + +class TestCase1(TestOverlapAddOp): + def initTestCase(self): + input_shape = (3, 50) + input_type = 'float64' + attrs = { + 'hop_length': 4, + 'axis': 0, + } + return input_shape, input_type, attrs + + +class TestCase2(TestOverlapAddOp): + def initTestCase(self): + input_shape = (2, 40, 5) + input_type = 'float64' + attrs = { + 'hop_length': 10, + 'axis': -1, + } + return input_shape, input_type, attrs + + +class TestCase3(TestOverlapAddOp): + def initTestCase(self): + input_shape = (5, 40, 2) + input_type = 'float64' + attrs = { + 'hop_length': 10, + 'axis': 0, + } + return input_shape, input_type, attrs + + +class TestCase4(TestOverlapAddOp): + def initTestCase(self): + input_shape = (3, 5, 12, 8) + input_type = 'float64' + attrs = { + 'hop_length': 5, + 'axis': -1, + } + return input_shape, input_type, attrs + + +class TestCase5(TestOverlapAddOp): + def initTestCase(self): + input_shape = (8, 12, 5, 3) + input_type = 'float64' + attrs = { + 'hop_length': 5, + 'axis': 0, + } + return input_shape, input_type, attrs + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py index 066bcf48612c5..95b8c5c3c0a94 100644 --- a/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py +++ b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py @@ -14,18 +14,89 @@ from __future__ import print_function +import paddle import math import numpy as np import unittest from op_test import OpTest +def calc_psroi_pool(x, rois, rois_num_per_img, output_channels, spatial_scale, + pooled_height, pooled_width): + """ + Psroi_pool implemented by Numpy. + x: 4-D as (N, C, H, W), + rois: 2-D as [[x1, y1, x2, y2], ...], + rois_num_per_img: 1-D as [nums_of_batch_0, nums_of_batch_1, ...] 
+ """ + output_shape = (len(rois), output_channels, pooled_height, pooled_width) + out_data = np.zeros(output_shape) + batch_id = 0 + rois_num_id = 0 + rois_num_left = rois_num_per_img[rois_num_id] + for i in range(len(rois)): + roi = rois[i] + roi_batch_id = batch_id + rois_num_left -= 1 + if rois_num_left == 0: + rois_num_id += 1 + if rois_num_id < len(rois_num_per_img): + rois_num_left = rois_num_per_img[rois_num_id] + batch_id += 1 + roi_start_w = round(roi[0]) * spatial_scale + roi_start_h = round(roi[1]) * spatial_scale + roi_end_w = (round(roi[2]) + 1.) * spatial_scale + roi_end_h = (round(roi[3]) + 1.) * spatial_scale + + roi_height = max(roi_end_h - roi_start_h, 0.1) + roi_width = max(roi_end_w - roi_start_w, 0.1) + + bin_size_h = roi_height / float(pooled_height) + bin_size_w = roi_width / float(pooled_width) + + x_i = x[roi_batch_id] + + for c in range(output_channels): + for ph in range(pooled_height): + for pw in range(pooled_width): + hstart = int( + math.floor(float(ph) * bin_size_h + roi_start_h)) + wstart = int( + math.floor(float(pw) * bin_size_w + roi_start_w)) + hend = int( + math.ceil(float(ph + 1) * bin_size_h + roi_start_h)) + wend = int( + math.ceil(float(pw + 1) * bin_size_w + roi_start_w)) + hstart = min(max(hstart, 0), x.shape[2]) + hend = min(max(hend, 0), x.shape[2]) + wstart = min(max(wstart, 0), x.shape[3]) + wend = min(max(wend, 0), x.shape[3]) + + c_in = (c * pooled_height + ph) * pooled_width + pw + is_empty = (hend <= hstart) or (wend <= wstart) + out_sum = 0. + for ih in range(hstart, hend): + for iw in range(wstart, wend): + out_sum += x_i[c_in, ih, iw] + bin_area = (hend - hstart) * (wend - wstart) + out_data[i, c, ph, pw] = 0. if is_empty else ( + out_sum / float(bin_area)) + return out_data + + class TestPSROIPoolOp(OpTest): def set_data(self): + paddle.enable_static() self.init_test_case() self.make_rois() - self.calc_psroi_pool() - self.inputs = {'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod)} + self.outs = calc_psroi_pool(self.x, self.boxes, self.boxes_num, + self.output_channels, self.spatial_scale, + self.pooled_height, + self.pooled_width).astype('float64') + self.inputs = { + 'X': self.x, + 'ROIs': (self.rois_with_batch_id[:, 1:5], self.rois_lod) + } self.attrs = { 'output_channels': self.output_channels, 'spatial_scale': self.spatial_scale, @@ -67,57 +138,10 @@ def make_rois(self): roi = [bno, x1, y1, x2, y2] rois.append(roi) self.rois_num = len(rois) - self.rois = np.array(rois).astype('float64') - - def calc_psroi_pool(self): - output_shape = (self.rois_num, self.output_channels, self.pooled_height, - self.pooled_width) - out_data = np.zeros(output_shape) - for i in range(self.rois_num): - roi = self.rois[i] - roi_batch_id = int(roi[0]) - roi_start_w = round(roi[1]) * self.spatial_scale - roi_start_h = round(roi[2]) * self.spatial_scale - roi_end_w = (round(roi[3]) + 1.) * self.spatial_scale - roi_end_h = (round(roi[4]) + 1.) 
* self.spatial_scale - - roi_height = max(roi_end_h - roi_start_h, 0.1) - roi_width = max(roi_end_w - roi_start_w, 0.1) - - bin_size_h = roi_height / float(self.pooled_height) - bin_size_w = roi_width / float(self.pooled_width) - - x_i = self.x[roi_batch_id] - - for c in range(self.output_channels): - for ph in range(self.pooled_height): - for pw in range(self.pooled_width): - hstart = int( - math.floor(float(ph) * bin_size_h + roi_start_h)) - wstart = int( - math.floor(float(pw) * bin_size_w + roi_start_w)) - hend = int( - math.ceil( - float(ph + 1) * bin_size_h + roi_start_h)) - wend = int( - math.ceil( - float(pw + 1) * bin_size_w + roi_start_w)) - hstart = min(max(hstart, 0), self.height) - hend = min(max(hend, 0), self.height) - wstart = min(max(wstart, 0), self.width) - wend = min(max(wend, 0), self.width) - - c_in = (c * self.pooled_height + ph - ) * self.pooled_width + pw - is_empty = (hend <= hstart) or (wend <= wstart) - out_sum = 0. - for ih in range(hstart, hend): - for iw in range(wstart, wend): - out_sum += x_i[c_in, ih, iw] - bin_area = (hend - hstart) * (wend - wstart) - out_data[i, c, ph, pw] = 0. if is_empty else ( - out_sum / float(bin_area)) - self.outs = out_data.astype('float64') + self.rois_with_batch_id = np.array(rois).astype('float64') + self.boxes = self.rois_with_batch_id[:, 1:] + self.boxes_num = np.array( + [bno + 1 for bno in range(self.batch_size)]).astype('int32') def setUp(self): self.op_type = 'psroi_pool' @@ -130,5 +154,175 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') +class TestPSROIPoolDynamicFunctionAPI(unittest.TestCase): + def setUp(self): + self.x = np.random.random([2, 490, 28, 28]).astype(np.float32) + self.boxes = np.array( + [[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]]).astype(np.float32) + self.boxes_num = np.array([1, 2]).astype(np.int32) + + def test_output_size(self): + def test_output_size_is_int(): + output_size = 7 + out = paddle.vision.ops.psroi_pool( + paddle.to_tensor(self.x), + paddle.to_tensor(self.boxes), + paddle.to_tensor(self.boxes_num), output_size).numpy() + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 10, + 1.0, 7, 7) + self.assertTrue(np.allclose(out, expect_out)) + + def test_output_size_is_tuple(): + output_size = (7, 7) + out = paddle.vision.ops.psroi_pool( + paddle.to_tensor(self.x), + paddle.to_tensor(self.boxes), + paddle.to_tensor(self.boxes_num), output_size).numpy() + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 10, + 1.0, 7, 7) + self.assertTrue(np.allclose(out, expect_out)) + + def test_dytype_is_float64(): + output_size = (7, 7) + out = paddle.vision.ops.psroi_pool( + paddle.to_tensor(self.x, 'float64'), + paddle.to_tensor(self.boxes, 'float64'), + paddle.to_tensor(self.boxes_num, 'int32'), output_size).numpy() + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 10, + 1.0, 7, 7) + self.assertTrue(np.allclose(out, expect_out)) + + places = ['cpu'] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append('gpu') + for place in places: + paddle.set_device(place) + test_output_size_is_int() + test_output_size_is_tuple() + test_dytype_is_float64() + + +class TestPSROIPoolDynamicClassAPI(unittest.TestCase): + def setUp(self): + self.x = np.random.random([2, 128, 32, 32]).astype(np.float32) + self.boxes = np.array([[3, 5, 6, 13], [7, 4, 22, 18], [4, 5, 7, 10], + [5, 3, 25, 21]]).astype(np.float32) + self.boxes_num = np.array([2, 2]).astype(np.int32) + + def test_output_size(self): + def test_output_size_is_int(): + psroi_module = 
paddle.vision.ops.PSRoIPool(8, 1.1) + out = psroi_module( + paddle.to_tensor(self.x), + paddle.to_tensor(self.boxes), + paddle.to_tensor(self.boxes_num)).numpy() + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 2, + 1.1, 8, 8) + self.assertTrue(np.allclose(out, expect_out)) + + def test_output_size_is_tuple(): + psroi_pool_module = paddle.vision.ops.PSRoIPool(8, 1.1) + out = psroi_pool_module( + paddle.to_tensor(self.x), + paddle.to_tensor(self.boxes), + paddle.to_tensor(self.boxes_num)).numpy() + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 2, + 1.1, 8, 8) + self.assertTrue(np.allclose(out, expect_out)) + + def test_dytype_is_float64(): + psroi_pool_module = paddle.vision.ops.PSRoIPool(8, 1.1) + out = psroi_pool_module( + paddle.to_tensor(self.x, 'float64'), + paddle.to_tensor(self.boxes, 'float64'), + paddle.to_tensor(self.boxes_num, 'int32')).numpy() + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 2, + 1.1, 8, 8) + self.assertTrue(np.allclose(out, expect_out)) + + paddle.disable_static() + places = ['cpu'] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append('gpu') + for place in places: + paddle.set_device(place) + test_output_size_is_int() + test_output_size_is_tuple() + test_dytype_is_float64() + + +class TestPSROIPoolBoxesNumError(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.x = paddle.uniform([2, 490, 28, 28], dtype='float32') + self.boxes = paddle.to_tensor( + [[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]], 'float32') + + def test_errors(self): + def test_boxes_num_nums_error(): + boxes_num = paddle.to_tensor([1, 5], 'int32') + out = paddle.vision.ops.psroi_pool( + self.x, self.boxes, boxes_num, output_size=7) + + self.assertRaises(ValueError, test_boxes_num_nums_error) + + def test_boxes_num_length_error(): + boxes_num = paddle.to_tensor([1, 1, 1], 'int32') + out = paddle.vision.ops.psroi_pool( + self.x, self.boxes, boxes_num, output_size=7) + + self.assertRaises(ValueError, test_boxes_num_length_error) + + +class TestPSROIPoolChannelError(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.x = paddle.uniform([2, 490, 28, 28], dtype='float32') + self.boxes = paddle.to_tensor( + [[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]], 'float32') + self.output_size = 4 + + def test_errors(self): + def test_channel_error(): + boxes_num = paddle.to_tensor([2, 1], 'int32') + out = paddle.vision.ops.psroi_pool(self.x, self.boxes, boxes_num, + self.output_size) + + self.assertRaises(ValueError, test_channel_error) + + +class TestPSROIPoolStaticAPI(unittest.TestCase): + def setUp(self): + paddle.enable_static() + self.x_placeholder = paddle.static.data( + name='x', shape=[2, 490, 28, 28]) + self.x = np.random.random([2, 490, 28, 28]).astype(np.float32) + self.boxes_placeholder = paddle.static.data( + name='boxes', shape=[3, 4], lod_level=1) + self.boxes = np.array( + [[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]]).astype(np.float32) + self.boxes_num = np.array([1, 2]).astype(np.int32) + + def test_function_in_static(self): + output_size = 7 + out = paddle.vision.ops.psroi_pool(self.x_placeholder, + self.boxes_placeholder, + self.boxes_num, output_size) + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 10, + 1.0, 7, 7) + places = [paddle.CPUPlace()] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + for place in places: + exe = paddle.static.Executor(place) + boxes_lod_data = paddle.fluid.create_lod_tensor(self.boxes, + [[1, 2]], 
place) + out_res = exe.run(paddle.static.default_main_program(), + feed={'x': self.x, + 'boxes': boxes_lod_data}, + fetch_list=[out.name]) + self.assertTrue(np.allclose(out_res, expect_out)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py index 8b65fc4e431f9..a468d6e828ce1 100644 --- a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py +++ b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py @@ -32,6 +32,7 @@ class TestSGDOpBF16(OpTest): def setUp(self): self.op_type = 'sgd' self.dtype = np.uint16 + self.use_mkldnn = True self.conf() w = np.random.random((self.h, self.w)).astype('float32') w_bf16 = convert_float_to_uint16(w) @@ -42,6 +43,7 @@ def setUp(self): self.inputs = {'Param': w_bf16, 'Grad': g_bf16, 'LearningRate': lr_bf16} self.outputs = {'ParamOut': w - lr * g} + self.attrs = {'use_mkldnn': self.use_mkldnn} def conf(self): self.h = 102 @@ -53,7 +55,7 @@ def test_check_output(self): @unittest.skipIf(not core.supports_bfloat16(), 'place does not support BF16 evaluation') -class TestSGDOpCase8XBF16(TestSGDOpBF16): +class TestSGDOpBF16Case2(TestSGDOpBF16): def conf(self): self.h = 10 self.w = 64 @@ -142,7 +144,8 @@ def test_sparse_grad_sgd(self): Param='Param', Grad='Grad', ParamOut='Param', - LearningRate='LearningRate') + LearningRate='LearningRate', + use_mkldnn=True) sgd_op.run(scope, place) reference = self.ref_optimize(param_array, self.grad_rows, grad_array, @@ -194,7 +197,8 @@ def test_sparse_param_grad_sgd(self): Param='Param', Grad='Grad', ParamOut='Param', - LearningRate='LearningRate') + LearningRate='LearningRate', + use_mkldnn=True) sgd_op.run(scope, place) reference = self.ref_optimize(param_array, self.grad_rows, grad_array, @@ -213,6 +217,11 @@ def setup_params(self): @OpTestTool.skip_if_not_cpu_bf16() class TestSGDOpBF16API(unittest.TestCase): + @classmethod + def setUpClass(cls): + np.random.seed(12345) + fluid.set_flags({'FLAGS_use_mkldnn': True}) + def setUp(self): self.sample_count = 20 self.value = np.random.random() @@ -222,9 +231,7 @@ def setUp(self): self.y_shape = (32, 16) self.learning_rate = 0.1 - np.random.seed(12345) self._set_initializer() - fluid.set_flags({'FLAGS_use_mkldnn': True}) def _fp322bf16(self, val: np.float32): return np.uint16(struct.unpack('> 16) diff --git a/python/paddle/fluid/tests/unittests/test_signal.py b/python/paddle/fluid/tests/unittests/test_signal.py new file mode 100644 index 0000000000000..a109a5aa5d1a6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_signal.py @@ -0,0 +1,1005 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
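# Illustrative sketch of the fix_length helper defined further below: it pads
# with zeros or truncates along the chosen axis so the output has exactly
# `size` samples. The underscore-prefixed names are hypothetical and not part
# of this test file.
import numpy as np

_y = np.arange(5, dtype=np.float64)
_padded = np.pad(_y, (0, 3), mode='constant')   # length 5 -> 8, zero-padded tail
_trimmed = _y[:3]                               # length 5 -> 3, truncated
assert _padded.shape == (8,) and _trimmed.shape == (3,)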
+ +import re +import sys +import unittest + +import numpy as np +from numpy import fft +from numpy.lib.stride_tricks import as_strided +import paddle +import scipy.signal + +paddle.set_default_dtype('float64') + +DEVICES = [paddle.CPUPlace()] +if paddle.is_compiled_with_cuda(): + DEVICES.append(paddle.CUDAPlace(0)) +TEST_CASE_NAME = 'test_case' + +# Constrain STFT block sizes to 256 KB +MAX_MEM_BLOCK = 2**8 * 2**10 + + +def fix_length(data, size, axis=-1, **kwargs): + kwargs.setdefault("mode", "constant") + + n = data.shape[axis] + + if n > size: + slices = [slice(None)] * data.ndim + slices[axis] = slice(0, size) + return data[tuple(slices)] + + elif n < size: + lengths = [(0, 0)] * data.ndim + lengths[axis] = (0, size - n) + return np.pad(data, lengths, **kwargs) + + return data + + +def tiny(x): + # Make sure we have an array view + x = np.asarray(x) + + # Only floating types generate a tiny + if np.issubdtype(x.dtype, np.floating) or np.issubdtype(x.dtype, + np.complexfloating): + dtype = x.dtype + else: + dtype = np.float32 + + return np.finfo(dtype).tiny + + +def normalize(S, norm=np.inf, axis=0, threshold=None, fill=None): + # Avoid div-by-zero + if threshold is None: + threshold = tiny(S) + + elif threshold <= 0: + raise Exception("threshold={} must be strictly " + "positive".format(threshold)) + + if fill not in [None, False, True]: + raise Exception("fill={} must be None or boolean".format(fill)) + + if not np.all(np.isfinite(S)): + raise Exception("Input must be finite") + + # All norms only depend on magnitude, let's do that first + mag = np.abs(S).astype(np.float) + + # For max/min norms, filling with 1 works + fill_norm = 1 + + if norm == np.inf: + length = np.max(mag, axis=axis, keepdims=True) + + elif norm == -np.inf: + length = np.min(mag, axis=axis, keepdims=True) + + elif norm == 0: + if fill is True: + raise Exception("Cannot normalize with norm=0 and fill=True") + + length = np.sum(mag > 0, axis=axis, keepdims=True, dtype=mag.dtype) + + elif np.issubdtype(type(norm), np.number) and norm > 0: + length = np.sum(mag**norm, axis=axis, keepdims=True)**(1.0 / norm) + + if axis is None: + fill_norm = mag.size**(-1.0 / norm) + else: + fill_norm = mag.shape[axis]**(-1.0 / norm) + + elif norm is None: + return S + + else: + raise Exception("Unsupported norm: {}".format(repr(norm))) + + # indices where norm is below the threshold + small_idx = length < threshold + + Snorm = np.empty_like(S) + if fill is None: + # Leave small indices un-normalized + length[small_idx] = 1.0 + Snorm[:] = S / length + + elif fill: + # If we have a non-zero fill value, we locate those entries by + # doing a nan-divide. + # If S was finite, then length is finite (except for small positions) + length[small_idx] = np.nan + Snorm[:] = S / length + Snorm[np.isnan(Snorm)] = fill_norm + else: + # Set small values to zero by doing an inf-divide. + # This is safe (by IEEE-754) as long as S is finite. 
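+        # (finite / inf == 0.0, so the sub-threshold entries come out as zeros
+        #  instead of being divided by a vanishingly small norm)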
+ length[small_idx] = np.inf + Snorm[:] = S / length + + return Snorm + + +def __window_ss_fill(x, win_sq, n_frames, hop_length): # pragma: no cover + """Helper function for window sum-square calculation.""" + + n = len(x) + n_fft = len(win_sq) + for i in range(n_frames): + sample = i * hop_length + x[sample:min(n, sample + n_fft)] += win_sq[:max(0, + min(n_fft, n - sample))] + + +def window_sumsquare( + window, + n_frames, + hop_length=512, + win_length=None, + n_fft=2048, + dtype=np.float32, + norm=None, ): + if win_length is None: + win_length = n_fft + + n = n_fft + hop_length * (n_frames - 1) + x = np.zeros(n, dtype=dtype) + + # Compute the squared window at the desired length + win_sq = get_window(window, win_length) + win_sq = normalize(win_sq, norm=norm)**2 + win_sq = pad_center(win_sq, n_fft) + + # Fill the envelope + __window_ss_fill(x, win_sq, n_frames, hop_length) + + return x + + +def dtype_c2r(d, default=np.float32): + mapping = { + np.dtype(np.complex64): np.float32, + np.dtype(np.complex128): np.float64, + } + + # If we're given a real type already, return it + dt = np.dtype(d) + if dt.kind == "f": + return dt + + # Otherwise, try to map the dtype. + # If no match is found, return the default. + return np.dtype(mapping.get(np.dtype(d), default)) + + +def dtype_r2c(d, default=np.complex64): + mapping = { + np.dtype(np.float32): np.complex64, + np.dtype(np.float64): np.complex128, + } + + # If we're given a complex type already, return it + dt = np.dtype(d) + if dt.kind == "c": + return dt + + # Otherwise, try to map the dtype. + # If no match is found, return the default. + return np.dtype(mapping.get(dt, default)) + + +def frame(x, frame_length, hop_length, axis=-1): + if not isinstance(x, np.ndarray): + raise Exception("Input must be of type numpy.ndarray, " + "given type(x)={}".format(type(x))) + + if x.shape[axis] < frame_length: + raise Exception("Input is too short (n={:d})" + " for frame_length={:d}".format(x.shape[axis], + frame_length)) + + if hop_length < 1: + raise Exception("Invalid hop_length: {:d}".format(hop_length)) + + if axis == -1 and not x.flags["F_CONTIGUOUS"]: + print("librosa.util.frame called with axis={} " + "on a non-contiguous input. This will result in a copy.".format( + axis)) + x = np.asfortranarray(x) + elif axis == 0 and not x.flags["C_CONTIGUOUS"]: + print("librosa.util.frame called with axis={} " + "on a non-contiguous input. 
This will result in a copy.".format( + axis)) + x = np.ascontiguousarray(x) + + n_frames = 1 + (x.shape[axis] - frame_length) // hop_length + strides = np.asarray(x.strides) + + new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize + + if axis == -1: + shape = list(x.shape)[:-1] + [frame_length, n_frames] + strides = list(strides) + [hop_length * new_stride] + + elif axis == 0: + shape = [n_frames, frame_length] + list(x.shape)[1:] + strides = [hop_length * new_stride] + list(strides) + + else: + raise Exception("Frame axis={} must be either 0 or -1".format(axis)) + + return as_strided(x, shape=shape, strides=strides) + + +def pad_center(data, size, axis=-1, **kwargs): + kwargs.setdefault("mode", "constant") + + n = data.shape[axis] + + lpad = int((size - n) // 2) + + lengths = [(0, 0)] * data.ndim + lengths[axis] = (lpad, int(size - n - lpad)) + + if lpad < 0: + raise Exception(("Target size ({:d}) must be " + "at least input size ({:d})").format(size, n)) + + return np.pad(data, lengths, **kwargs) + + +def get_window(window, Nx, fftbins=True): + if callable(window): + return window(Nx) + + elif isinstance(window, (str, tuple)) or np.isscalar(window): + # TODO: if we add custom window functions in librosa, call them here + + return scipy.signal.get_window(window, Nx, fftbins=fftbins) + + elif isinstance(window, (np.ndarray, list)): + if len(window) == Nx: + return np.asarray(window) + + raise Exception("Window size mismatch: " + "{:d} != {:d}".format(len(window), Nx)) + else: + raise Exception("Invalid window specification: {}".format(window)) + + +def __overlap_add(y, ytmp, hop_length): + # numba-accelerated overlap add for inverse stft + # y is the pre-allocated output buffer + # ytmp is the windowed inverse-stft frames + # hop_length is the hop-length of the STFT analysis + + n_fft = ytmp.shape[0] + for frame in range(ytmp.shape[1]): + sample = frame * hop_length + y[sample:(sample + n_fft)] += ytmp[:, frame] + + +def stft(x, + n_fft=2048, + hop_length=None, + win_length=None, + window="hann", + center=True, + pad_mode="reflect"): + y = x + input_rank = len(y.shape) + if input_rank == 2: + assert y.shape[0] == 1 # Only 1d input supported in librosa + y = y.squeeze(0) + dtype = None + + # By default, use the entire frame + if win_length is None: + win_length = n_fft + + # Set the default hop, if it's not already specified + if hop_length is None: + hop_length = int(win_length // 4) + + fft_window = get_window(window, win_length, fftbins=True) + + # Pad the window out to n_fft size + fft_window = pad_center(fft_window, n_fft) + + # Reshape so that the window can be broadcast + fft_window = fft_window.reshape((-1, 1)) + + # Pad the time series so that frames are centered + if center: + if n_fft > y.shape[-1]: + print("n_fft={} is too small for input signal of length={}".format( + n_fft, y.shape[-1])) + + y = np.pad(y, int(n_fft // 2), mode=pad_mode) + + elif n_fft > y.shape[-1]: + raise Exception("n_fft={} is too large for input signal of length={}". + format(n_fft, y.shape[-1])) + + # Window the time series. + y_frames = frame(y, frame_length=n_fft, hop_length=hop_length) + + if dtype is None: + dtype = dtype_r2c(y.dtype) + + # Pre-allocate the STFT matrix + stft_matrix = np.empty( + (int(1 + n_fft // 2), y_frames.shape[1]), dtype=dtype, order="F") + + # how many columns can we fit within MAX_MEM_BLOCK? 
+ n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize) + n_columns = max(n_columns, 1) + + for bl_s in range(0, stft_matrix.shape[1], n_columns): + bl_t = min(bl_s + n_columns, stft_matrix.shape[1]) + + stft_matrix[:, bl_s:bl_t] = fft.rfft( + fft_window * y_frames[:, bl_s:bl_t], axis=0) + + if input_rank == 2: + stft_matrix = np.expand_dims(stft_matrix, 0) + + return stft_matrix + + +def istft( + x, + hop_length=None, + win_length=None, + window="hann", + center=True, + length=None, ): + + stft_matrix = x + input_rank = len(stft_matrix.shape) + if input_rank == 3: + assert stft_matrix.shape[0] == 1 # Only 2d input supported in librosa + stft_matrix = stft_matrix.squeeze(0) + dtype = None + + n_fft = 2 * (stft_matrix.shape[0] - 1) + + # By default, use the entire frame + if win_length is None: + win_length = n_fft + + # Set the default hop, if it's not already specified + if hop_length is None: + hop_length = int(win_length // 4) + + ifft_window = get_window(window, win_length, fftbins=True) + + # Pad out to match n_fft, and add a broadcasting axis + ifft_window = pad_center(ifft_window, n_fft)[:, np.newaxis] + + # For efficiency, trim STFT frames according to signal length if available + if length: + if center: + padded_length = length + int(n_fft) + else: + padded_length = length + n_frames = min(stft_matrix.shape[1], + int(np.ceil(padded_length / hop_length))) + else: + n_frames = stft_matrix.shape[1] + + expected_signal_len = n_fft + hop_length * (n_frames - 1) + + if dtype is None: + dtype = dtype_c2r(stft_matrix.dtype) + + y = np.zeros(expected_signal_len, dtype=dtype) + + n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize) + n_columns = min(n_columns, 1) + + frame = 0 + for bl_s in range(0, n_frames, n_columns): + bl_t = min(bl_s + n_columns, n_frames) + + # invert the block and apply the window function + ytmp = ifft_window * fft.irfft(stft_matrix[:, bl_s:bl_t], axis=0) + + # Overlap-add the istft block starting at the i'th frame + __overlap_add(y[frame * hop_length:], ytmp, hop_length) + + frame += bl_t - bl_s + + # Normalize by sum of squared window + ifft_window_sum = window_sumsquare( + window, + n_frames, + win_length=win_length, + n_fft=n_fft, + hop_length=hop_length, + dtype=dtype, ) + + approx_nonzero_indices = ifft_window_sum > tiny(ifft_window_sum) + y[approx_nonzero_indices] /= ifft_window_sum[approx_nonzero_indices] + + if length is None: + # If we don't need to control length, just do the usual center trimming + # to eliminate padded data + if center: + y = y[int(n_fft // 2):-int(n_fft // 2)] + else: + if center: + # If we're centering, crop off the first n_fft//2 samples + # and then trim/pad to the target length. 
+ # We don't trim the end here, so that if the signal is zero-padded + # to a longer duration, the decay is smooth by windowing + start = int(n_fft // 2) + else: + # If we're not centering, start at 0 and trim/pad as necessary + start = 0 + + y = fix_length(y[start:], length) + + if input_rank == 3: + y = np.expand_dims(y, 0) + + return y + + +def frame_for_api_test(x, frame_length, hop_length, axis=-1): + if axis == -1 and not x.flags["C_CONTIGUOUS"]: + x = np.ascontiguousarray(x) + elif axis == 0 and not x.flags["F_CONTIGUOUS"]: + x = np.asfortranarray(x) + + n_frames = 1 + (x.shape[axis] - frame_length) // hop_length + strides = np.asarray(x.strides) + + if axis == -1: + shape = list(x.shape)[:-1] + [frame_length, n_frames] + strides = list(strides) + [hop_length * x.itemsize] + + elif axis == 0: + shape = [n_frames, frame_length] + list(x.shape)[1:] + strides = [hop_length * x.itemsize] + list(strides) + + else: + raise ValueError("Frame axis={} must be either 0 or -1".format(axis)) + + return as_strided(x, shape=shape, strides=strides) + + +def overlap_add_for_api_test(x, hop_length, axis=-1): + assert axis in [0, -1], 'axis should be 0/-1.' + assert len(x.shape) >= 2, 'Input dims shoulb be >= 2.' + + squeeze_output = False + if len(x.shape) == 2: + squeeze_output = True + dim = 0 if axis == -1 else -1 + x = np.expand_dims(x, dim) # batch + + n_frames = x.shape[axis] + frame_length = x.shape[1] if axis == 0 else x.shape[-2] + + # Assure no gaps between frames. + assert 0 < hop_length <= frame_length, \ + f'hop_length should be in (0, frame_length({frame_length})], but got {hop_length}.' + + seq_length = (n_frames - 1) * hop_length + frame_length + + reshape_output = False + if len(x.shape) > 3: + reshape_output = True + if axis == 0: + target_shape = [seq_length] + list(x.shape[2:]) + x = x.reshape(n_frames, frame_length, np.product(x.shape[2:])) + else: + target_shape = list(x.shape[:-2]) + [seq_length] + x = x.reshape(np.product(x.shape[:-2]), frame_length, n_frames) + + if axis == 0: + x = x.transpose((2, 1, 0)) + + y = np.zeros(shape=[np.product(x.shape[:-2]), seq_length], dtype=x.dtype) + for i in range(x.shape[0]): + for frame in range(x.shape[-1]): + sample = frame * hop_length + y[i, sample:sample + frame_length] += x[i, :, frame] + + if axis == 0: + y = y.transpose((1, 0)) + + if reshape_output: + y = y.reshape(target_shape) + + if squeeze_output: + y = y.squeeze(-1) if axis == 0 else y.squeeze(0) + + return y + + +def place(devices, key='place'): + def decorate(cls): + module = sys.modules[cls.__module__].__dict__ + raw_classes = { + k: v + for k, v in module.items() if k.startswith(cls.__name__) + } + + for raw_name, raw_cls in raw_classes.items(): + for d in devices: + test_cls = dict(raw_cls.__dict__) + test_cls.update({key: d}) + new_name = raw_name + '.' 
+ d.__class__.__name__ + module[new_name] = type(new_name, (raw_cls, ), test_cls) + del module[raw_name] + return cls + + return decorate + + +def setUpModule(): + global rtol + global atol + # All test case will use float64 for compare percision, refs: + # https://github.com/PaddlePaddle/Paddle/wiki/Upgrade-OP-Precision-to-Float64 + rtol = { + 'float32': 1e-06, + 'float64': 1e-7, + 'complex64': 1e-06, + 'complex128': 1e-7, + } + atol = { + 'float32': 0.0, + 'float64': 0.0, + 'complex64': 0.0, + 'complex128': 0.0, + } + + +def tearDownModule(): + pass + + +def rand_x(dims=1, + dtype='float64', + min_dim_len=1, + max_dim_len=10, + shape=None, + complex=False): + + if shape is None: + shape = [ + np.random.randint(min_dim_len, max_dim_len) for i in range(dims) + ] + if complex: + return np.random.randn(*shape).astype(dtype) + 1.j * np.random.randn( + *shape).astype(dtype) + else: + return np.random.randn(*shape).astype(dtype) + + +def parameterize(attrs, input_values=None): + + if isinstance(attrs, str): + attrs = [attrs] + input_dicts = (attrs if input_values is None else + [dict(zip(attrs, vals)) for vals in input_values]) + + def decorator(base_class): + test_class_module = sys.modules[base_class.__module__].__dict__ + for idx, input_dict in enumerate(input_dicts): + test_class_dict = dict(base_class.__dict__) + test_class_dict.update(input_dict) + + name = class_name(base_class, idx, input_dict) + + test_class_module[name] = type(name, (base_class, ), + test_class_dict) + + for method_name in list(base_class.__dict__): + if method_name.startswith("test"): + delattr(base_class, method_name) + return base_class + + return decorator + + +def class_name(cls, num, params_dict): + suffix = to_safe_name( + next((v for v in params_dict.values() if isinstance(v, str)), "")) + if TEST_CASE_NAME in params_dict: + suffix = to_safe_name(params_dict["test_case"]) + return "{}_{}{}".format(cls.__name__, num, suffix and "_" + suffix) + + +def to_safe_name(s): + return str(re.sub("[^a-zA-Z0-9_]+", "_", s)) + + +# yapf: disable +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'frame_length', 'hop_length', 'axis'), + [ + ('test_1d_input1', rand_x(1, np.float64, shape=[150]), 50, 15, 0), + ('test_1d_input2', rand_x(1, np.float64, shape=[150]), 50, 15, -1), + ('test_2d_input1', rand_x(2, np.float64, shape=[150, 8]), 50, 15, 0), + ('test_2d_input2', rand_x(2, np.float64, shape=[8, 150]), 50, 15, -1), + ('test_3d_input1', rand_x(3, np.float64, shape=[150, 4, 2]), 50, 15, 0), + ('test_3d_input2', rand_x(3, np.float64, shape=[4, 2, 150]), 50, 15, -1), + ]) +class TestFrame(unittest.TestCase): + def test_frame(self): + self.assertTrue( + np.allclose( + frame_for_api_test(self.x, self.frame_length, self.hop_length, self.axis), + paddle.tensor.signal.frame( + paddle.to_tensor(self.x), + self.frame_length, + self.hop_length, + self.axis), + rtol=rtol.get(str(self.x.dtype)), + atol=atol.get(str(self.x.dtype)))) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'frame_length', 'hop_length', 'axis'), + [ + ('test_1d_input1', rand_x(1, np.float64, shape=[150]), 50, 15, 0), + ('test_1d_input2', rand_x(1, np.float64, shape=[150]), 50, 15, -1), + ('test_2d_input1', rand_x(2, np.float64, shape=[150, 8]), 50, 15, 0), + ('test_2d_input2', rand_x(2, np.float64, shape=[8, 150]), 50, 15, -1), + ('test_3d_input1', rand_x(3, np.float64, shape=[150, 4, 2]), 50, 15, 0), + ('test_3d_input2', rand_x(3, np.float64, shape=[4, 2, 150]), 50, 15, -1), + ]) +class TestFrameStatic(unittest.TestCase): + def 
test_frame_static(self): + paddle.enable_static() + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + input = paddle.static.data('input', self.x.shape, dtype=self.x.dtype) + output = paddle.tensor.signal.frame( + input, + self.frame_length, + self.hop_length, + self.axis), + exe = paddle.static.Executor(self.place) + exe.run(sp) + [output] = exe.run(mp, feed={'input': self.x}, fetch_list=[output]) + paddle.disable_static() + + self.assertTrue( + np.allclose( + frame_for_api_test(self.x, self.frame_length, self.hop_length, self.axis), + output, + rtol=rtol.get(str(self.x.dtype)), + atol=atol.get(str(self.x.dtype)))) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'frame_length', 'hop_length', 'axis', 'expect_exception'), + [ + ('test_axis', rand_x(1, np.float64, shape=[150]), 50, 15, 2, ValueError), + ('test_hop_length', rand_x(1, np.float64, shape=[150]), 50, 0, -1, ValueError), + ('test_frame_length1', rand_x(2, np.float64, shape=[150, 8]), 0, 15, 0, ValueError), + ('test_frame_length2', rand_x(2, np.float64, shape=[150, 8]), 151, 15, 0, ValueError), + ]) +class TestFrameException(unittest.TestCase): + def test_frame(self): + with self.assertRaises(self.expect_exception): + paddle.tensor.signal.frame( + paddle.to_tensor(self.x), + self.frame_length, + self.hop_length, + self.axis) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'hop_length', 'axis'), + [ + ('test_2d_input1', rand_x(2, np.float64, shape=[3, 50]), 4, 0), + ('test_2d_input2', rand_x(2, np.float64, shape=[50, 3]), 4, -1), + ('test_3d_input1', rand_x(3, np.float64, shape=[5, 40, 2]), 10, 0), + ('test_3d_input2', rand_x(3, np.float64, shape=[2, 40, 5]), 10, -1), + ('test_4d_input1', rand_x(4, np.float64, shape=[8, 12, 5, 3]), 5, 0), + ('test_4d_input2', rand_x(4, np.float64, shape=[3, 5, 12, 8]), 5, -1), + ]) +class TestOverlapAdd(unittest.TestCase): + def test_overlap_add(self): + self.assertTrue( + np.allclose( + overlap_add_for_api_test(self.x, self.hop_length, self.axis), + paddle.tensor.signal.overlap_add( + paddle.to_tensor(self.x), + self.hop_length, + self.axis), + rtol=rtol.get(str(self.x.dtype)), + atol=atol.get(str(self.x.dtype)))) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'hop_length', 'axis'), + [ + ('test_2d_input1', rand_x(2, np.float64, shape=[3, 50]), 4, 0), + ('test_2d_input2', rand_x(2, np.float64, shape=[50, 3]), 4, -1), + ('test_3d_input1', rand_x(3, np.float64, shape=[5, 40, 2]), 10, 0), + ('test_3d_input2', rand_x(3, np.float64, shape=[2, 40, 5]), 10, -1), + ('test_4d_input1', rand_x(4, np.float64, shape=[8, 12, 5, 3]), 5, 0), + ('test_4d_input2', rand_x(4, np.float64, shape=[3, 5, 12, 8]), 5, -1), + ]) +class TestOverlapAddStatic(unittest.TestCase): + def test_overlap_add_static(self): + paddle.enable_static() + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + input = paddle.static.data('input', self.x.shape, dtype=self.x.dtype) + output = paddle.tensor.signal.overlap_add( + input, + self.hop_length, + self.axis), + exe = paddle.static.Executor(self.place) + exe.run(sp) + [output] = exe.run(mp, feed={'input': self.x}, fetch_list=[output]) + paddle.disable_static() + + self.assertTrue( + np.allclose( + overlap_add_for_api_test(self.x, self.hop_length, self.axis), + output, + rtol=rtol.get(str(self.x.dtype)), + atol=atol.get(str(self.x.dtype)))) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'hop_length', 'axis', 'expect_exception'), + [ + 
('test_axis', rand_x(2, np.float64, shape=[3, 50]), 4, 2, ValueError), + ('test_hop_length', rand_x(2, np.float64, shape=[50, 3]), -1, -1, ValueError), + ]) +class TestOverlapAddException(unittest.TestCase): + def test_overlap_add(self): + with self.assertRaises(self.expect_exception): + paddle.tensor.signal.overlap_add( + paddle.to_tensor(self.x), + self.hop_length, + self.axis) + + +# ================= STFT +# common args +# x +# n_fft, +# hop_length=None, +# win_length=None, +# window=None, +# center=True, +# pad_mode='reflect', + +# paddle only +# normalized=False, +# onesided=True, + +# ================= ISTFT +# common args +# x, +# hop_length=None, +# win_length=None, +# window=None, +# center=True, +# length=None, + +# paddle only +# n_fft, +# normalized=False, +# onesided=True, +# return_complex=False, + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n_fft', 'hop_length', 'win_length', 'window', 'center', 'pad_mode', 'normalized', 'onesided'), + [ + ('test_1d_input', rand_x(1, np.float64, shape=[160000]), + 512, None, None, get_window('hann', 512), True, 'reflect', False, True), + ('test_2d_input', rand_x(2, np.float64, shape=[1, 160000]), + 512, None, None, get_window('hann', 512), True, 'reflect', False, True), + ('test_hop_length', rand_x(2, np.float64, shape=[1, 160000]), + 512, 255, None, get_window('hann', 512), True, 'reflect', False, True), + ('test_win_length', rand_x(2, np.float64, shape=[1, 160000]), + 512, 255, 499, get_window('hann', 499), True, 'reflect', False, True), + ('test_window', rand_x(2, np.float64, shape=[1, 160000]), + 512, None, None, None, True, 'reflect', False, True), + ('test_center', rand_x(2, np.float64, shape=[1, 160000]), + 512, None, None, None, False, 'reflect', False, True), + ]) +class TestStft(unittest.TestCase): + def test_stft(self): + if self.window is None: + win_p = None + win_l = 'boxcar' # rectangular window + else: + win_p = paddle.to_tensor(self.window) + win_l = self.window + + self.assertTrue( + np.allclose( + stft(self.x, self.n_fft, self.hop_length, self.win_length, win_l, self.center, self.pad_mode), + paddle.tensor.signal.stft( + paddle.to_tensor(self.x), + self.n_fft, + self.hop_length, + self.win_length, + win_p, + self.center, + self.pad_mode, + self.normalized, + self.onesided), + rtol=rtol.get(str(self.x.dtype)), + atol=atol.get(str(self.x.dtype)))) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n_fft', 'hop_length', 'win_length', 'window', 'center', 'pad_mode', 'normalized', 'onesided', 'expect_exception'), + [ + ('test_dims', rand_x(1, np.float64, shape=[1, 2, 3]), + 512, None, None, None, True, 'reflect', False, True, AssertionError), + ('test_hop_length', rand_x(1, np.float64, shape=[16000]), + 512, 0, None, None, True, 'reflect', False, True, AssertionError), + ('test_nfft1', rand_x(1, np.float64, shape=[16000]), + 0, None, None, None, True, 'reflect', False, True, AssertionError), + ('test_nfft2', rand_x(1, np.float64, shape=[16000]), + 16001, None, None, None, True, 'reflect', False, True, AssertionError), + ('test_win_length', rand_x(1, np.float64, shape=[16000]), + 512, None, 0, None, True, 'reflect', False, True, AssertionError), + ('test_win_length', rand_x(1, np.float64, shape=[16000]), + 512, None, 513, None, True, 'reflect', False, True, AssertionError), + ('test_pad_mode', rand_x(1, np.float64, shape=[16000]), + 512, None, None, None, True, 'nonsense', False, True, AssertionError), + ('test_complex_onesided', rand_x(1, np.float64, shape=[16000], complex=True), + 512, None, None, 
None, False, 'reflect', False, True, AssertionError), + ]) +class TestStftException(unittest.TestCase): + def test_stft(self): + if self.window is None: + win_p = None + else: + win_p = paddle.to_tensor(self.window) + + with self.assertRaises(self.expect_exception): + paddle.tensor.signal.stft( + paddle.to_tensor(self.x), + self.n_fft, + self.hop_length, + self.win_length, + win_p, + self.center, + self.pad_mode, + self.normalized, + self.onesided), + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n_fft', 'hop_length', 'win_length', 'window', 'center', 'normalized', 'onesided', 'length', 'return_complex'), + [ + ('test_2d_input', rand_x(2, np.float64, shape=[257, 471], complex=True), + 512, None, None, get_window('hann', 512), True, False, True, None, False), + ('test_3d_input', rand_x(3, np.float64, shape=[1, 257, 471], complex=True), + 512, None, None, get_window('hann', 512), True, False, True, None, False), + ('test_hop_length', rand_x(3, np.float64, shape=[1, 257, 471], complex=True), + 512, 99, None, get_window('hann', 512), True, False, True, None, False), + ('test_win_length', rand_x(3, np.float64, shape=[1, 257, 471], complex=True), + 512, 99, 299, get_window('hann', 299), True, False, True, None, False), + ('test_window', rand_x(3, np.float64, shape=[1, 257, 471], complex=True), + 512, None, None, None, True, False, True, None, False), + ('test_center', rand_x(3, np.float64, shape=[1, 257, 471], complex=True), + 512, None, None, None, False, False, True, None, False), + ('test_length', rand_x(3, np.float64, shape=[1, 257, 471], complex=True), + 512, None, None, None, False, False, True, 1888, False), + ]) +class TestIstft(unittest.TestCase): + def test_istft(self): + if self.window is None: + win_p = None + win_l = 'boxcar' # rectangular window + else: + win_p = paddle.to_tensor(self.window) + win_l = self.window + + self.assertTrue( + np.allclose( + istft(self.x, self.hop_length, self.win_length, win_l, self.center, self.length), + paddle.tensor.signal.istft( + paddle.to_tensor(self.x), + self.n_fft, + self.hop_length, + self.win_length, + win_p, + self.center, + self.normalized, + self.onesided, + self.length, + self.return_complex), + rtol=rtol.get(str(self.x.dtype)), + atol=atol.get(str(self.x.dtype)))) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n_fft', 'hop_length', 'win_length', 'window', 'center', 'normalized', 'onesided', 'length', 'return_complex', 'expect_exception'), + [ + ('test_dims', rand_x(4, np.float64, shape=[1, 2, 3, 4], complex=True), + 512, None, None, get_window('hann', 512), True, False, True, None, False, AssertionError), + ('test_n_fft', rand_x(3, np.float64, shape=[1, 257, 471], complex=True), + 257, None, None, get_window('hann', 512), True, False, True, None, False, AssertionError), + ('test_hop_length1', rand_x(3, np.float64, shape=[1, 257, 471], complex=True), + 512, 0, None, get_window('hann', 512), True, False, True, None, False, AssertionError), + ('test_hop_length2', rand_x(3, np.float64, shape=[1, 257, 471], complex=True), + 512, 513, None, get_window('hann', 512), True, False, True, None, False, AssertionError), + ('test_win_length1', rand_x(3, np.float64, shape=[1, 257, 471], complex=True), + 512, None, 0, get_window('hann', 512), True, False, True, None, False, AssertionError), + ('test_win_length2', rand_x(3, np.float64, shape=[1, 257, 471], complex=True), + 512, None, 513, get_window('hann', 512), True, False, True, None, False, AssertionError), + ('test_onesided1', rand_x(3, np.float64, shape=[1, 257, 471], 
complex=True), + 20, None, None, get_window('hann', 512), True, False, True, None, False, AssertionError), + ('test_onesided2', rand_x(3, np.float64, shape=[1, 257, 471], complex=True), + 256, None, None, None, True, False, False, None, False, AssertionError), + ('test_window', rand_x(3, np.float64, shape=[1, 512, 471], complex=True), + 512, None, 511, get_window('hann', 512), True, False, False, None, False, AssertionError), + ('test_return_complex1', rand_x(3, np.float64, shape=[1, 257, 471], complex=True), + 512, None, None, get_window('hann', 512), True, False, True, None, True, AssertionError), + ('test_return_complex2', rand_x(3, np.float64, shape=[1, 257, 471], complex=True), + 512, None, None, rand_x(1, np.float64, shape=[512], complex=True), True, False, True, None, False, AssertionError), + ('test_NOLA', rand_x(3, np.float64, shape=[1, 257, 471], complex=True), + 512, 512, None, get_window('hann', 512), True, False, True, None, False, ValueError), + ]) +class TestIstftException(unittest.TestCase): + def test_istft(self): + if self.window is None: + win_p = None + else: + win_p = paddle.to_tensor(self.window) + + with self.assertRaises(self.expect_exception): + paddle.tensor.signal.istft( + paddle.to_tensor(self.x), + self.n_fft, + self.hop_length, + self.win_length, + win_p, + self.center, + self.normalized, + self.onesided, + self.length, + self.return_complex), + + +# yapf: enable + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_solve_op.py b/python/paddle/fluid/tests/unittests/test_solve_op.py new file mode 100644 index 0000000000000..fd527ec90f217 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_solve_op.py @@ -0,0 +1,563 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License.w + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +import sys +sys.path.append("..") +from op_test import OpTest +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +# 2D normal case +class TestSolveOp(OpTest): + def config(self): + self.input_x_matrix_shape = [15, 15] + self.input_y_matrix_shape = [15, 10] + self.dtype = "float64" + + def setUp(self): + paddle.enable_static() + self.config() + self.op_type = "solve" + + np.random.seed(2021) + self.inputs = { + 'X': np.random.random(self.input_x_matrix_shape).astype(self.dtype), + 'Y': np.random.random(self.input_y_matrix_shape).astype(self.dtype) + } + self.outputs = { + 'Out': np.linalg.solve(self.inputs['X'], self.inputs['Y']) + } + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + +# x broadcast + 3D batch case +class TestSolveOpBatched_case0(OpTest): + def setUp(self): + self.op_type = "solve" + self.dtype = "float64" + np.random.seed(2021) + self.inputs = { + 'X': np.random.random((11, 11)).astype(self.dtype), + 'Y': np.random.random((2, 11, 7)).astype(self.dtype) + } + result = np.linalg.solve(self.inputs['X'], self.inputs['Y']) + self.outputs = {'Out': result} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-1) + + +# 3D batch + y vector case +class TestSolveOpBatched_case1(OpTest): + def setUp(self): + self.op_type = "solve" + self.dtype = "float64" + np.random.seed(2021) + self.inputs = { + 'X': np.random.random((20, 6, 6)).astype(self.dtype), + 'Y': np.random.random((20, 6)).astype(self.dtype) + } + result = np.linalg.solve(self.inputs['X'], self.inputs['Y']) + self.outputs = {'Out': result} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.04) + + +# 3D batch + y broadcast case +class TestSolveOpBatched_case2(OpTest): + def setUp(self): + self.op_type = "solve" + self.dtype = "float64" + np.random.seed(2021) + self.inputs = { + 'X': np.random.random((2, 10, 10)).astype(self.dtype), + 'Y': np.random.random((1, 10, 10)).astype(self.dtype) + } + result = np.linalg.solve(self.inputs['X'], self.inputs['Y']) + self.outputs = {'Out': result} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.02) + + +# x broadcast + 3D batch case +class TestSolveOpBatched_case3(OpTest): + def setUp(self): + self.op_type = "solve" + self.dtype = "float64" + np.random.seed(2021) + self.inputs = { + 'X': np.random.random((1, 10, 10)).astype(self.dtype), + 'Y': np.random.random((2, 10, 10)).astype(self.dtype) + } + result = np.linalg.solve(self.inputs['X'], self.inputs['Y']) + self.outputs = {'Out': result} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.02) + + +# 3D normal batch case +class TestSolveOpBatched_case4(OpTest): + def setUp(self): + self.op_type = "solve" + self.dtype = "float64" + np.random.seed(2021) + self.inputs = { + 'X': np.random.random((3, 6, 6)).astype(self.dtype), + 'Y': np.random.random((3, 6, 7)).astype(self.dtype) + } + result = 
np.linalg.solve(self.inputs['X'], self.inputs['Y']) + self.outputs = {'Out': result} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + +# 4D normal batch case +class TestSolveOpBatched_case5(OpTest): + def setUp(self): + self.op_type = "solve" + self.dtype = "float64" + np.random.seed(2021) + self.inputs = { + 'X': np.random.random((2, 2, 6, 6)).astype(self.dtype), + 'Y': np.random.random((2, 2, 6, 6)).astype(self.dtype) + } + result = np.linalg.solve(self.inputs['X'], self.inputs['Y']) + self.outputs = {'Out': result} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + +# 4D batch + y broadcast case +class TestSolveOpBatched_case6(OpTest): + def setUp(self): + self.op_type = "solve" + self.dtype = "float64" + np.random.seed(2021) + self.inputs = { + 'X': np.random.random((2, 2, 6, 6)).astype(self.dtype), + 'Y': np.random.random((1, 2, 6, 9)).astype(self.dtype) + } + result = np.linalg.solve(self.inputs['X'], self.inputs['Y']) + self.outputs = {'Out': result} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + +# 5D normal batch case +class TestSolveOpBatched_case7(OpTest): + def setUp(self): + self.op_type = "solve" + self.dtype = "float64" + np.random.seed(2021) + self.inputs = { + 'X': np.random.random((2, 2, 2, 4, 4)).astype(self.dtype), + 'Y': np.random.random((2, 2, 2, 4, 4)).astype(self.dtype) + } + result = np.linalg.solve(self.inputs['X'], self.inputs['Y']) + self.outputs = {'Out': result} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.04) + + +# 5D batch + y broadcast case +class TestSolveOpBatched_case8(OpTest): + def setUp(self): + self.op_type = "solve" + self.dtype = "float64" + np.random.seed(2021) + self.inputs = { + 'X': np.random.random((2, 2, 2, 4, 4)).astype(self.dtype), + 'Y': np.random.random((1, 2, 2, 4, 7)).astype(self.dtype) + } + result = np.linalg.solve(self.inputs['X'], self.inputs['Y']) + self.outputs = {'Out': result} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.04) + + +class TestSolveOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # The input type of solve_op must be Variable. + x1 = fluid.create_lod_tensor( + np.array([[-1]]), [[1]], fluid.CPUPlace()) + y1 = fluid.create_lod_tensor( + np.array([[-1]]), [[1]], fluid.CPUPlace()) + self.assertRaises(TypeError, paddle.linalg.solve, x1, y1) + + # The data type of input must be float32 or float64. 
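+            # bool, int32, int64 and float16 inputs should all raise TypeError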
+ x2 = fluid.data(name="x2", shape=[30, 30], dtype="bool") + y2 = fluid.data(name="y2", shape=[30, 10], dtype="bool") + self.assertRaises(TypeError, paddle.linalg.solve, x2, y2) + + x3 = fluid.data(name="x3", shape=[30, 30], dtype="int32") + y3 = fluid.data(name="y3", shape=[30, 10], dtype="int32") + self.assertRaises(TypeError, paddle.linalg.solve, x3, y3) + + x4 = fluid.data(name="x4", shape=[30, 30], dtype="int64") + y4 = fluid.data(name="y4", shape=[30, 10], dtype="int64") + self.assertRaises(TypeError, paddle.linalg.solve, x4, y4) + + x5 = fluid.data(name="x5", shape=[30, 30], dtype="float16") + y5 = fluid.data(name="y5", shape=[30, 10], dtype="float16") + self.assertRaises(TypeError, paddle.linalg.solve, x5, y5) + + # The number of dimensions of input'X must be >= 2. + x6 = fluid.data(name="x6", shape=[30], dtype="float64") + y6 = fluid.data(name="y6", shape=[30], dtype="float64") + self.assertRaises(ValueError, paddle.linalg.solve, x6, y6) + + # The inner-most 2 dimensions of input'X should be equal to each other + x7 = fluid.data(name="x7", shape=[2, 3, 4], dtype="float64") + y7 = fluid.data(name="y7", shape=[2, 4, 3], dtype="float64") + self.assertRaises(ValueError, paddle.linalg.solve, x7, y7) + + +# 2D + vector case, FP64 +class TestSolveOpAPI_1(unittest.TestCase): + def setUp(self): + np.random.seed(2021) + self.place = [paddle.CPUPlace()] + self.dtype = "float64" + if core.is_compiled_with_cuda(): + self.place.append(paddle.CUDAPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + paddle_input_x = fluid.data( + name="input_x", shape=[3, 3], dtype=self.dtype) + paddle_input_y = fluid.data( + name="input_y", shape=[3], dtype=self.dtype) + paddle_result = paddle.linalg.solve(paddle_input_x, paddle_input_y) + + np_input_x = np.random.random([3, 3]).astype(self.dtype) + np_input_y = np.random.random([3]).astype(self.dtype) + + np_result = np.linalg.solve(np_input_x, np_input_y) + + exe = fluid.Executor(place) + fetches = exe.run( + fluid.default_main_program(), + feed={"input_x": np_input_x, + "input_y": np_input_y}, + fetch_list=[paddle_result]) + self.assertTrue( + np.allclose(fetches[0], np.linalg.solve(np_input_x, + np_input_y))) + + def test_static(self): + for place in self.place: + self.check_static_result(place=place) + + def test_dygraph(self): + def run(place): + paddle.disable_static(place) + np.random.seed(2021) + input_x_np = np.random.random([3, 3]).astype(self.dtype) + input_y_np = np.random.random([3]).astype(self.dtype) + + tensor_input_x = paddle.to_tensor(input_x_np) + tensor_input_y = paddle.to_tensor(input_y_np) + + numpy_output = np.linalg.solve(input_x_np, input_y_np) + paddle_output = paddle.linalg.solve(tensor_input_x, tensor_input_y) + self.assertEqual( + np.allclose(numpy_output, paddle_output.numpy()), True) + self.assertEqual(numpy_output.shape, paddle_output.numpy().shape) + paddle.enable_static() + + for place in self.place: + run(place) + + +# 2D normal case, FP64 +class TestSolveOpAPI_2(unittest.TestCase): + def setUp(self): + np.random.seed(2021) + self.place = [paddle.CPUPlace()] + self.dtype = "float64" + if core.is_compiled_with_cuda(): + self.place.append(paddle.CUDAPlace(0)) + + def check_static_result(self, place): + paddle.enable_static() + with fluid.program_guard(fluid.Program(), fluid.Program()): + paddle_input_x = fluid.data( + name="input_x", shape=[10, 10], dtype=self.dtype) + paddle_input_y = fluid.data( + name="input_y", shape=[10, 4], dtype=self.dtype) + paddle_result = 
paddle.linalg.solve(paddle_input_x, paddle_input_y) + + np_input_x = np.random.random([10, 10]).astype(self.dtype) + np_input_y = np.random.random([10, 4]).astype(self.dtype) + + np_result = np.linalg.solve(np_input_x, np_input_y) + + exe = fluid.Executor(place) + fetches = exe.run( + fluid.default_main_program(), + feed={"input_x": np_input_x, + "input_y": np_input_y}, + fetch_list=[paddle_result]) + self.assertTrue( + np.allclose(fetches[0], np.linalg.solve(np_input_x, + np_input_y))) + + def test_static(self): + for place in self.place: + self.check_static_result(place=place) + + def test_dygraph(self): + def run(place): + paddle.disable_static(place) + np.random.seed(2021) + input_x_np = np.random.random([10, 10]).astype(self.dtype) + input_y_np = np.random.random([10, 4]).astype(self.dtype) + + tensor_input_x = paddle.to_tensor(input_x_np) + tensor_input_y = paddle.to_tensor(input_y_np) + + numpy_output = np.linalg.solve(input_x_np, input_y_np) + paddle_output = paddle.linalg.solve(tensor_input_x, tensor_input_y) + self.assertEqual( + np.allclose(numpy_output, paddle_output.numpy()), True) + self.assertEqual(numpy_output.shape, paddle_output.numpy().shape) + paddle.enable_static() + + for place in self.place: + run(place) + + +# 2D normal case, FP32 +class TestSolveOpAPI_3(unittest.TestCase): + def setUp(self): + np.random.seed(2021) + self.place = [paddle.CPUPlace()] + self.dtype = "float32" + if core.is_compiled_with_cuda(): + self.place.append(paddle.CUDAPlace(0)) + + def check_static_result(self, place): + paddle.enable_static() + with fluid.program_guard(fluid.Program(), fluid.Program()): + paddle_input_x = fluid.data( + name="input_x", shape=[10, 10], dtype=self.dtype) + paddle_input_y = fluid.data( + name="input_y", shape=[10, 4], dtype=self.dtype) + paddle_result = paddle.linalg.solve(paddle_input_x, paddle_input_y) + + np_input_x = np.random.random([10, 10]).astype(self.dtype) + np_input_y = np.random.random([10, 4]).astype(self.dtype) + + np_result = np.linalg.solve(np_input_x, np_input_y) + + exe = fluid.Executor(place) + fetches = exe.run( + fluid.default_main_program(), + feed={"input_x": np_input_x, + "input_y": np_input_y}, + fetch_list=[paddle_result]) + self.assertTrue( + np.allclose( + fetches[0], + np.linalg.solve(np_input_x, np_input_y), + rtol=1.e-4)) + + def test_static(self): + for place in self.place: + self.check_static_result(place=place) + + def test_dygraph(self): + def run(place): + paddle.disable_static(place) + np.random.seed(2021) + input_x_np = np.random.random([10, 10]).astype(self.dtype) + input_y_np = np.random.random([10, 4]).astype(self.dtype) + + tensor_input_x = paddle.to_tensor(input_x_np) + tensor_input_y = paddle.to_tensor(input_y_np) + + numpy_output = np.linalg.solve(input_x_np, input_y_np) + paddle_output = paddle.linalg.solve(tensor_input_x, tensor_input_y) + self.assertEqual( + np.allclose( + numpy_output, paddle_output.numpy(), rtol=1.e-4), + True) + self.assertEqual(numpy_output.shape, paddle_output.numpy().shape) + paddle.enable_static() + + for place in self.place: + run(place) + + +# 3D + y broadcast case, FP64 +class TestSolveOpAPI_4(unittest.TestCase): + def setUp(self): + np.random.seed(2021) + self.place = [paddle.CPUPlace()] + self.dtype = "float64" + if core.is_compiled_with_cuda(): + self.place.append(paddle.CUDAPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + paddle_input_x = fluid.data( + name="input_x", shape=[2, 3, 3], dtype=self.dtype) + paddle_input_y = 
fluid.data( + name="input_y", shape=[1, 3, 3], dtype=self.dtype) + paddle_result = paddle.linalg.solve(paddle_input_x, paddle_input_y) + + np_input_x = np.random.random([2, 3, 3]).astype(self.dtype) + np_input_y = np.random.random([1, 3, 3]).astype(self.dtype) + + np_result = np.linalg.solve(np_input_x, np_input_y) + + exe = fluid.Executor(place) + fetches = exe.run( + fluid.default_main_program(), + feed={"input_x": np_input_x, + "input_y": np_input_y}, + fetch_list=[paddle_result]) + self.assertTrue( + np.allclose(fetches[0], np.linalg.solve(np_input_x, + np_input_y))) + + def test_static(self): + for place in self.place: + self.check_static_result(place=place) + + def test_dygraph(self): + def run(place): + paddle.disable_static(place) + np.random.seed(2021) + input_x_np = np.random.random([2, 3, 3]).astype(self.dtype) + input_y_np = np.random.random([1, 3, 3]).astype(self.dtype) + + tensor_input_x = paddle.to_tensor(input_x_np) + tensor_input_y = paddle.to_tensor(input_y_np) + + numpy_output = np.linalg.solve(input_x_np, input_y_np) + paddle_output = paddle.linalg.solve(tensor_input_x, tensor_input_y) + self.assertEqual( + np.allclose(numpy_output, paddle_output.numpy()), True) + self.assertEqual(numpy_output.shape, paddle_output.numpy().shape) + paddle.enable_static() + + for place in self.place: + run(place) + + +class TestSolveOpSingularAPI(unittest.TestCase): + # Singular matrix is ​​not invertible + def setUp(self): + self.places = [fluid.CPUPlace()] + self.dtype = "float64" + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.data(name="x", shape=[4, 4], dtype=self.dtype) + y = fluid.data(name="y", shape=[4, 4], dtype=self.dtype) + + result = paddle.linalg.solve(x, y) + + input_x_np = np.ones([4, 4]).astype(self.dtype) + input_y_np = np.ones([4, 4]).astype(self.dtype) + + exe = fluid.Executor(place) + try: + fetches = exe.run(fluid.default_main_program(), + feed={"x": input_x_np, + "y": input_y_np}, + fetch_list=[result]) + except RuntimeError as ex: + print("The mat is singular") + pass + except ValueError as ex: + print("The mat is singular") + pass + + def test_static(self): + for place in self.places: + paddle.enable_static() + self.check_static_result(place=place) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + input_x_np = np.ones([4, 4]).astype(self.dtype) + input_y_np = np.ones([4, 4]).astype(self.dtype) + input_x = fluid.dygraph.to_variable(input_x_np) + input_y = fluid.dygraph.to_variable(input_y_np) + + try: + result = paddle.linalg.solve(input_x, input_y) + except RuntimeError as ex: + print("The mat is singular") + pass + except ValueError as ex: + print("The mat is singular") + pass + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index 59b4afdf8b02d..e255350fd6618 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -113,6 +113,99 @@ def initTestCase(self): self.axis = (6, 1, 3, 5, 0, 2, 4, 7) +class TestTransposeOpBool(TestTransposeOp): + def test_check_grad(self): + pass + + +class TestTransposeOpBool1D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (100, ) + self.axis = (0, ) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs 
= { + 'XShape': np.random.random(self.shape).astype("bool"), + 'Out': self.inputs['X'].transpose(self.axis) + } + + +class TestTransposeOpBool2D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (3, 40) + self.axis = (1, 0) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = { + 'XShape': np.random.random(self.shape).astype("bool"), + 'Out': self.inputs['X'].transpose(self.axis) + } + + +class TestTransposeOpBool3D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (3, 4, 10) + self.axis = (0, 2, 1) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = { + 'XShape': np.random.random(self.shape).astype("bool"), + 'Out': self.inputs['X'].transpose(self.axis) + } + + +class TestTransposeOpBool4D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (2, 3, 4, 5) + self.axis = (0, 2, 3, 1) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = { + 'XShape': np.random.random(self.shape).astype("bool"), + 'Out': self.inputs['X'].transpose(self.axis) + } + + +class TestTransposeOpBool5D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.axis = (4, 2, 3, 1, 0) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = { + 'XShape': np.random.random(self.shape).astype("bool"), + 'Out': self.inputs['X'].transpose(self.axis) + } + + +class TestTransposeOpBool6D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6, 1) + self.axis = (4, 2, 3, 1, 0, 5) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = { + 'XShape': np.random.random(self.shape).astype("bool"), + 'Out': self.inputs['X'].transpose(self.axis) + } + + +class TestTransposeOpBool7D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (2, 3, 2, 3, 2, 4, 3) + self.axis = (0, 1, 3, 2, 4, 5, 6) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = { + 'XShape': np.random.random(self.shape).astype("bool"), + 'Out': self.inputs['X'].transpose(self.axis) + } + + +class TestTransposeOpBool8D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (2, 3, 2, 3, 2, 4, 3, 3) + self.axis = (6, 1, 3, 5, 0, 2, 4, 7) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = { + 'XShape': np.random.random(self.shape).astype("bool"), + 'Out': self.inputs['X'].transpose(self.axis) + } + + class TestTransposeOpError(unittest.TestCase): def test_errors(self): paddle.enable_static() @@ -126,9 +219,9 @@ def test_x_Variable_check(): self.assertRaises(TypeError, test_x_Variable_check) def test_x_dtype_check(): - # the Input(x)'s dtype must be one of [float16, float32, float64, int32, int64] + # the Input(x)'s dtype must be one of [bool, float16, float32, float64, int32, int64] x1 = fluid.layers.data( - name='x1', shape=[10, 5, 3], dtype='bool') + name='x1', shape=[10, 5, 3], dtype='int8') fluid.layers.transpose(x1, perm=[1, 0, 2]) self.assertRaises(TypeError, test_x_dtype_check) diff --git a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py index 2b3383239a0ce..26d63826cc87a 100644 --- a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py @@ -48,6 +48,7 @@ 'lgamma', \ 'svd', \ 'matrix_power', \ + 'solve', \ ] NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST = 
['bilinear_interp',\ diff --git a/python/paddle/fluid/tests/unittests/xpu/test_arg_max_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_arg_max_op_xpu.py new file mode 100644 index 0000000000000..cbdd9db8ee7f2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_arg_max_op_xpu.py @@ -0,0 +1,194 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +from op_test_xpu import XPUOpTest +import paddle +import paddle.fluid.core as core + +paddle.enable_static() + + +class XPUBaseTestCase(XPUOpTest): + def initTestCase(self): + self.dims = (3, 4) + self.dtype = 'float32' + self.axis = 1 + + def setUp(self): + self.initTestCase() + self.__class__.op_type = 'arg_max' + self.__class__.use_xpu = True + np.random.seed(2021) + self.x = (np.random.random(self.dims)).astype(self.dtype) + self.inputs = {'X': self.x} + self.attrs = {'axis': self.axis, 'use_xpu': True} + if self.op_type == "arg_min": + self.outputs = {'Out': np.argmin(self.x, axis=self.axis)} + else: + self.outputs = {'Out': np.argmax(self.x, axis=self.axis)} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +# test argmax, dtype: float32 +class TestArgMaxFloat32Case1(XPUBaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = -1 + + +class TestArgMaxFloat32Case2(XPUBaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 0 + + +class TestArgMaxFloat32Case3(XPUBaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 1 + + +class TestArgMaxFloat32Case4(XPUBaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 2 + + +class TestArgMaxFloat32Case5(XPUBaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4) + self.dtype = 'float32' + self.axis = -1 + + +class TestArgMaxFloat32Case6(XPUBaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4) + self.dtype = 'float32' + self.axis = 0 + + +class TestArgMaxFloat32Case7(XPUBaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4) + self.dtype = 'float32' + self.axis = 1 + + +class TestArgMaxFloat32Case8(XPUBaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (1, ) + self.dtype = 'float32' + self.axis = 0 + + +class TestArgMaxFloat32Case9(XPUBaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (2, ) + self.dtype = 'float32' + self.axis = 0 + + +class TestArgMaxFloat32Case10(XPUBaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims 
= (3, ) + self.dtype = 'float32' + self.axis = 0 + + +class TestArgMaxAPI(unittest.TestCase): + def initTestCase(self): + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 0 + + def setUp(self): + self.initTestCase() + self.__class__.use_Xpu = True + self.place = [paddle.XPUPlace(0)] + + def test_dygraph_api(self): + def run(place): + paddle.disable_static(place) + np.random.seed(2021) + numpy_input = (np.random.random(self.dims)).astype(self.dtype) + tensor_input = paddle.to_tensor(numpy_input) + numpy_output = np.argmax(numpy_input, axis=self.axis) + paddle_output = paddle.argmax(tensor_input, axis=self.axis) + self.assertEqual( + np.allclose(numpy_output, paddle_output.numpy()), True) + paddle.enable_static() + + for place in self.place: + run(place) + + +class TestArgMaxAPI_2(unittest.TestCase): + def initTestCase(self): + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 0 + self.keep_dims = True + + def setUp(self): + self.initTestCase() + self.__class__.use_xpu = True + self.place = [paddle.XPUPlace(0)] + + def test_dygraph_api(self): + def run(place): + paddle.disable_static(place) + np.random.seed(2021) + numpy_input = (np.random.random(self.dims)).astype(self.dtype) + tensor_input = paddle.to_tensor(numpy_input) + numpy_output = np.argmax( + numpy_input, axis=self.axis).reshape(1, 4, 5) + paddle_output = paddle.argmax( + tensor_input, axis=self.axis, keepdim=self.keep_dims) + self.assertEqual( + np.allclose(numpy_output, paddle_output.numpy()), True) + self.assertEqual(numpy_output.shape, paddle_output.numpy().shape) + paddle.enable_static() + + for place in self.place: + run(place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py new file mode 100644 index 0000000000000..b745dce9efef4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py @@ -0,0 +1,116 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
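+
+# The tests below compare the XPU iou_similarity kernel against the
+# pure-Python IoU reference implemented in _compute_iou, with and without
+# LoD inputs and box normalization.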
+ +from __future__ import print_function +import unittest +import sys +sys.path.append("..") + +import unittest +import numpy as np +import numpy.random as random +import sys +import math +from op_test import OpTest +from op_test_xpu import XPUOpTest +import paddle + +paddle.enable_static() + + +class TestXPUIOUSimilarityOp(XPUOpTest): + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def setUp(self): + self.op_type = "iou_similarity" + self.boxes1 = random.rand(2, 4).astype('float32') + self.boxes2 = random.rand(3, 4).astype('float32') + self.output = random.rand(2, 3).astype('float32') + self.box_normalized = False + # run python iou computation + self._compute_iou() + self.inputs = {'X': self.boxes1, 'Y': self.boxes2} + self.attrs = {"box_normalized": self.box_normalized, 'use_xpu': True} + self.outputs = {'Out': self.output} + + def _compute_iou(self, ): + for row in range(self.boxes1.shape[0]): + for col in range(self.boxes2.shape[0]): + xmin1, ymin1, xmax1, ymax1 = self.boxes1[row] + xmin2, ymin2, xmax2, ymax2 = self.boxes2[col] + if not self.box_normalized: + area1 = (ymax1 - ymin1 + 1) * (xmax1 - xmin1 + 1) + area2 = (ymax2 - ymin2 + 1) * (xmax2 - xmin2 + 1) + else: + area1 = (ymax1 - ymin1) * (xmax1 - xmin1) + area2 = (ymax2 - ymin2) * (xmax2 - xmin2) + + inter_xmax = min(xmax1, xmax2) + inter_ymax = min(ymax1, ymax2) + inter_xmin = max(xmin1, xmin2) + inter_ymin = max(ymin1, ymin2) + inter_height = inter_ymax - inter_ymin + inter_width = inter_xmax - inter_xmin + if not self.box_normalized: + inter_height += 1 + inter_width += 1 + inter_height = max(inter_height, 0) + inter_width = max(inter_width, 0) + inter_area = inter_width * inter_height + union_area = area1 + area2 - inter_area + sim_score = inter_area / union_area + self.output[row, col] = sim_score + + +class TestXPUIOUSimilarityOpWithLoD(TestXPUIOUSimilarityOp): + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, check_dygraph=False) + + def setUp(self): + super(TestXPUIOUSimilarityOpWithLoD, self).setUp() + self.boxes1_lod = [[1, 1]] + self.output_lod = [[1, 1]] + self.box_normalized = False + # run python iou computation + self._compute_iou() + self.inputs = {'X': (self.boxes1, self.boxes1_lod), 'Y': self.boxes2} + self.attrs = {"box_normalized": self.box_normalized} + self.outputs = {'Out': (self.output, self.output_lod)} + + +class TestXPUIOUSimilarityOpWithBoxNormalized(TestXPUIOUSimilarityOp): + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, check_dygraph=False) + + def setUp(self): + super(TestXPUIOUSimilarityOpWithBoxNormalized, self).setUp() + self.boxes1_lod = [[1, 1]] + self.output_lod = [[1, 1]] + self.box_normalized = True + # run python iou computation + self._compute_iou() + self.inputs = {'X': (self.boxes1, self.boxes1_lod), 'Y': self.boxes2} + self.attrs = {"box_normalized": self.box_normalized} + self.outputs = {'Out': (self.output, self.output_lod)} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/incubate/operators/softmax_mask_fuse.py b/python/paddle/incubate/operators/softmax_mask_fuse.py index bbc4175b0d1c4..4c95a1ab51288 100644 --- a/python/paddle/incubate/operators/softmax_mask_fuse.py +++ b/python/paddle/incubate/operators/softmax_mask_fuse.py @@ -20,6 +20,43 @@ def softmax_mask_fuse(x, mask, name=None): + """ + Do a 
masked softmax on x. + + This is designed for speeding up Transformer structure. + Used for reducing operation such as: tmp = x + mask, out = softmax(tmp). + The equation is: + + .. math:: + out = softmax(x + mask) + + **Note**: + This API only supports GPU. + + Args: + x (4-D Tensor): The input tensor, should be in 4D shape, it's data type should be float16, float32. + The fourth dimension of x must be larger or equal to 32 and less then 8192. + mask (4-D Tensor): The input tensor, should be in 4D shape, it's data type should be float16, float32. + The second dimension of mask must be 1, and other dimensions must be same with x. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + 4-D Tensor. A location into which the result is stored. It’s dimension is 4D. Has same shape with x. + + Examples: + .. code-block:: python + + # required: gpu + import paddle + import paddle.incubate as incubate + + x = paddle.rand([2, 8, 8, 32]) + mask = paddle.rand([2, 1, 8, 32]) + + rst = incubate.softmax_mask_fuse(x, mask) + # [[[[0.02404429, 0.04658398, 0.02746007, ..., 0.01489375, 0.02397441, 0.02851614] ... ]]] + """ if in_dygraph_mode(): out = core.ops.fused_softmax_mask(x, mask) return out diff --git a/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py b/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py index 636d0f5f9dd3b..918adf8c21a1c 100644 --- a/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py +++ b/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py @@ -21,11 +21,41 @@ def softmax_mask_fuse_upper_triangle(x): """ - Fuse softmax mask together without even give a mask. - Under GPT model, the mask is always be a upper triangle - so we can simply mask the upper triangle part of x to get the mask result - :param x: the input x (rst of QK) - :return: the result of softmax mask fuse (upper triangle) + Do a masked softmax on x, which will always mask upper triangle part of x. + + This is designed for speeding up GPT kind Transformer structure. + Used for reducing operation such as: tmp = x + mask, out = softmax(tmp), where the mask is + always be an upper triangle matrix. + The equation is: + + .. math:: + out = softmax(LowerTriangular(x)) + + **Note**: + This API only supports GPU. + + Args: + x (4-D Tensor): The input tensor, should be in 4D shape, it's data type should be float16, float32 + The fourth dimension of x must be larger or equal to 32 and less then 8192. + The third dimension of x must be same with the fourth dimension of x. + + Returns: + 4-D Tensor. A location into which the result is stored. It’s dimension is 4D. Has same dimension with x. + + Examples: + .. code-block:: python + + # required: gpu + import paddle + import paddle.incubate as incubate + + x = paddle.rand((1, 1, 32, 32)) + + rst = incubate.softmax_mask_fuse_upper_triangle(x) + # [[[[1. , 0. , 0. , ..., 0., 0., 0.], + # [0.45324376, 0.54675621, 0. , ..., 0., 0., 0.], + # [0.32674268, 0.28156221, 0.39169508, ..., 0., 0., 0.] + # ... 
]]] """ if in_dygraph_mode(): out = core.ops.fused_softmax_mask_upper_triangle(x) diff --git a/python/paddle/linalg.py b/python/paddle/linalg.py index 74d015b86b5c9..d57d9a4bdb678 100644 --- a/python/paddle/linalg.py +++ b/python/paddle/linalg.py @@ -16,20 +16,30 @@ from .tensor.linalg import norm # noqa: F401 from .tensor.linalg import cond # noqa: F401 from .tensor.linalg import matrix_power # noqa: F401 +from .tensor.linalg import solve # noqa: F401 from .tensor import inverse as inv # noqa: F401 +from .tensor.linalg import eigvals # noqa: F401 from .tensor.linalg import multi_dot # noqa: F401 from .tensor.linalg import matrix_rank from .tensor.linalg import svd from .tensor.linalg import eigh # noqa: F401 +from .tensor.linalg import det +from .tensor.linalg import slogdet +from .tensor.linalg import pinv __all__ = [ 'cholesky', #noqa 'norm', 'cond', 'inv', + 'eigvals', 'multi_dot', 'matrix_rank', 'svd', 'matrix_power', - 'eigh' + 'det', + 'slogdet', + 'eigh', + 'pinv', + 'solve' ] diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index fcfbea438d7cc..fdd370d7f81e7 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -1274,7 +1274,8 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): x_dim = len(x.shape) - if mode == "constant" and isinstance(pad, list) and len(pad) == x_dim * 2: + if mode == "constant" and isinstance(pad, ( + list, tuple)) and len(pad) == x_dim * 2: return layers.pad(x, pad, pad_value=value) assert x_dim in [ diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index db73e56f879a7..89843885c8a12 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -86,7 +86,8 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): check_type(p, 'p', (float, int), 'normalize') check_type(axis, 'axis', (int), 'normalize') - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'normalize') + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'normalize') if len(x.shape) == 1 and axis != 0 and axis != -1: raise ValueError( "Axis must be 0 or -1 when x is a 1-D tensor, but received axis = {}". 
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index e065ee91c6840..cc28eead522d4 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -22,6 +22,8 @@ import warnings from ..fluid.dygraph import base as imperative_base from collections import defaultdict +import numpy as np +import time import paddle from paddle import _C_ops @@ -208,26 +210,29 @@ def __init__(self, } def _create_master_weight(self, param): - assert isinstance(self.helper, LayerHelper) - - var_name = param.name + "_fp32_master" - var_name = unique_name.generate(var_name) - var = layers.create_global_var( - name=var_name, - shape=param.shape, - value=0, - dtype='float32', - persistable=True) - block = self.helper.startup_program.global_block() - block.append_op( - type="cast", - inputs={"X": [param]}, - outputs={"Out": [var]}, - attrs={ - "in_dtype": param.dtype, - "out_dtype": core.VarDesc.VarType.FP32 - }) - self._master_weights[param.name] = var + if param.name in self._master_weights: + var = self._master_weights[param.name] + else: + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + "_fp32_master" + var_name = unique_name.generate(var_name) + var = layers.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True) + block = self.helper.startup_program.global_block() + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32 + }) + self._master_weights[param.name] = var return var def _get_accumulator(self, name, param): @@ -317,12 +322,13 @@ def _append_optimize_op(self, block, param_and_grad): self._beta1, Variable) else self._beta1.numpy().item(0) _beta2 = self._beta2 if not isinstance( self._beta2, Variable) else self._beta2.numpy().item(0) - _, _, _, _, _ = _C_ops.adam( + _, _, _, _, _, _ = _C_ops.adam( param_and_grad[0], param_and_grad[1], lr, moment1, moment2, - beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1, - moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon, - 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread', - 1000, 'beta1', _beta1, 'beta2', _beta2) + beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0], + moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, + 'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode, + 'min_row_size_to_use_multithread', 1000, 'beta1', _beta1, + 'beta2', _beta2, 'multi_precision', find_master) return None diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 0efc40d3300b3..34fb201d8ccaf 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -18,7 +18,7 @@ from ..fluid import framework from ..fluid.framework import Variable from ..fluid.dygraph import base as imperative_base -from collections import Callable +from collections.abc import Callable import paddle _C_ops = core.ops @@ -297,13 +297,15 @@ def _append_optimize_op(self, block, param_and_grad): self._beta1, Variable) else self._beta1.numpy().item(0) _beta2 = self._beta2 if not isinstance( self._beta2, Variable) else self._beta2.numpy().item(0) - _, _, _, _, _ = _C_ops.adamw( + + _, _, _, _, _, _ = _C_ops.adamw( param_and_grad[0], param_and_grad[1], lr, moment1, moment2, - beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1, - moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon, - 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread', - 1000, 
'beta1', _beta1, 'beta2', _beta2, 'coeff', self._coeff, - "lr_ratio", lr_ratio_) + beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0], + moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, + 'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode, + 'min_row_size_to_use_multithread', 1000, 'beta1', _beta1, + 'beta2', _beta2, 'coeff', self._coeff, 'multi_precision', + find_master, "lr_ratio", lr_ratio_) return None diff --git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py index d33c9ecbb4167..fde3b28607344 100644 --- a/python/paddle/optimizer/momentum.py +++ b/python/paddle/optimizer/momentum.py @@ -170,7 +170,7 @@ def __init__(self, 'regularization_method': self._regularization_method, 'regularization_coeff': self._regularization_coeff, } - + ''' if framework.in_dygraph_mode(): self.helper = LayerHelper(self.__class__.__name__) if isinstance(self._parameter_list[0], dict): @@ -180,6 +180,7 @@ def __init__(self, else: for p in parameters: self._add_accumulator(self._velocity_acc_str, p) + ''' def _update_regularization(self, weight_decay): reg_method = "" @@ -194,26 +195,29 @@ def _update_regularization(self, weight_decay): return reg_method, reg_coeff def _create_master_weight(self, param): - assert isinstance(self.helper, LayerHelper) - - var_name = param.name + "_fp32_master" - var_name = unique_name.generate(var_name) - var = layers.create_global_var( - name=var_name, - shape=param.shape, - value=0, - dtype='float32', - persistable=True) - block = self.helper.startup_program.global_block() - block.append_op( - type="cast", - inputs={"X": [param]}, - outputs={"Out": [var]}, - attrs={ - "in_dtype": param.dtype, - "out_dtype": core.VarDesc.VarType.FP32 - }) - self._master_weights[param.name] = var + if param.name in self._master_weights: + var = self._master_weights[param.name] + else: + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + "_fp32_master" + var_name = unique_name.generate(var_name) + var = layers.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True) + block = self.helper.startup_program.global_block() + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32 + }) + self._master_weights[param.name] = var return var def _get_accumulator(self, name, param): @@ -239,10 +243,15 @@ def _get_accumulator(self, name, param): return self._accumulators[name][target_name] def _create_accumulators(self, block, parameters): + ''' if framework.in_dygraph_mode(): return - + ''' assert isinstance(block, framework.Block) + + if isinstance(parameters, dict): + parameters = self._update_param_group(parameters) + for p in parameters: if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16: master_p = self._create_master_weight(p) @@ -291,21 +300,23 @@ def _append_optimize_op(self, block, param_and_grad): regularization_method = "" regularization_coeff = 0 + find_master = self._multi_precision and param_and_grad[ + 0].dtype == core.VarDesc.VarType.FP16 + master_weight = (self._master_weights[param_and_grad[0].name] + if find_master else None) + if framework.in_dygraph_mode(): if isinstance(param_and_grad, dict): self._update_regularization(param_and_grad['weight_decay']) - _, _ = _C_ops.momentum( + _, _, _ = _C_ops.momentum( param_and_grad[0], param_and_grad[1], velocity_acc, lr, - param_and_grad[0], velocity_acc, 'mu', self._momentum, - 'use_nesterov', 
self._use_nesterov, 'regularization_method', - regularization_method, 'regularization_coeff', - regularization_coeff) - return None + master_weight, param_and_grad[0], velocity_acc, master_weight, + 'mu', self._momentum, 'use_nesterov', self._use_nesterov, + 'regularization_method', regularization_method, + 'regularization_coeff', regularization_coeff, 'multi_precision', + find_master) - find_master = self._multi_precision and param_and_grad[ - 0].dtype == core.VarDesc.VarType.FP16 - master_weight = (self._master_weights[param_and_grad[0].name] - if find_master else None) + return None attrs = { "mu": self._momentum, diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index 93f34b2297943..0f463b0c7d941 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -48,6 +48,7 @@ from ..fluid.layers.nn import py_func # noqa: F401 from ..fluid.parallel_executor import ParallelExecutor # noqa: F401 from ..fluid.param_attr import WeightNormParamAttr # noqa: F401 +from ..fluid.optimizer import ExponentialMovingAverage # noqa: F401 from ..fluid.io import save # noqa: F401 from ..fluid.io import load # noqa: F401 from ..fluid.io import load_program_state # noqa: F401 @@ -76,6 +77,7 @@ 'ParallelExecutor', 'program_guard', 'WeightNormParamAttr', + 'ExponentialMovingAverage', 'default_main_program', 'default_startup_program', 'Program', diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 318cbb8120716..080a06455a681 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -46,9 +46,12 @@ from .linalg import histogram # noqa: F401 from .linalg import mv # noqa: F401 from .linalg import matrix_power # noqa: F401 +from .linalg import eigvals # noqa: F401 from .linalg import multi_dot # noqa: F401 from .linalg import svd # noqa: F401 from .linalg import eigh # noqa: F401 +from .linalg import pinv # noqa: F401 +from .linalg import solve # noqa: F401 from .logic import equal # noqa: F401 from .logic import greater_equal # noqa: F401 from .logic import greater_than # noqa: F401 @@ -214,6 +217,8 @@ from .array import create_array # noqa: F401 from .einsum import einsum # noqa: F401 +from . import fft +from . 
import signal #this list used in math_op_patch.py for _binary_creator_ tensor_method_func = [ #noqa @@ -230,6 +235,7 @@ 'histogram', 'mv', 'matrix_power', + 'eigvals', 'abs', 'acos', 'all', @@ -381,6 +387,8 @@ 'bitwise_not', 'broadcast_tensors', 'uniform_', + 'multi_dot', + 'solve', ] #this list used in math_op_patch.py for magic_method bind diff --git a/python/paddle/tensor/attribute.py b/python/paddle/tensor/attribute.py index 3a86b09c5c393..8d8c2a83de1db 100644 --- a/python/paddle/tensor/attribute.py +++ b/python/paddle/tensor/attribute.py @@ -35,6 +35,41 @@ def _complex_to_real_dtype(dtype): return dtype +def _real_to_complex_dtype(dtype): + if dtype == core.VarDesc.VarType.FP32: + return core.VarDesc.VarType.COMPLEX64 + elif dtype == core.VarDesc.VarType.FP64: + return core.VarDesc.VarType.COMPLEX128 + else: + return dtype + + +def is_complex(x): + dtype = x.dtype + is_complex_dtype = (dtype == core.VarDesc.VarType.COMPLEX64 or + dtype == core.VarDesc.VarType.COMPLEX128) + return is_complex_dtype + + +def is_floating_point(x): + dtype = x.dtype + is_fp_dtype = (dtype == core.VarDesc.VarType.FP32 or + dtype == core.VarDesc.VarType.FP64 or + dtype == core.VarDesc.VarType.FP16 or + dtype == core.VarDesc.VarType.BF16) + return is_fp_dtype + + +def is_interger(x): + dtype = x.dtype + is_int_dtype = (dtype == core.VarDesc.VarType.UINT8 or + dtype == core.VarDesc.VarType.INT8 or + dtype == core.VarDesc.VarType.INT16 or + dtype == core.VarDesc.VarType.INT32 or + dtype == core.VarDesc.VarType.INT64) + return is_int_dtype + + def real(x, name=None): """ Returns a new tensor containing real values of the input tensor. diff --git a/python/paddle/tensor/fft.py b/python/paddle/tensor/fft.py new file mode 100644 index 0000000000000..98ca858c0eb85 --- /dev/null +++ b/python/paddle/tensor/fft.py @@ -0,0 +1,1609 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Sequence +import numpy as np +import paddle +from .attribute import is_complex, is_floating_point, is_interger, _real_to_complex_dtype, _complex_to_real_dtype +from ..fluid.framework import in_dygraph_mode +from .. import _C_ops +from ..fluid.data_feeder import check_variable_and_dtype +from ..fluid.layer_helper import LayerHelper + +__all__ = [ + 'fft', + 'fft2', + 'fftn', + 'ifft', + 'ifft2', + 'ifftn', + 'rfft', + 'rfft2', + 'rfftn', + 'irfft', + 'irfft2', + 'irfftn', + 'hfft', + 'hfft2', + 'hfftn', + 'ihfft', + 'ihfft2', + 'ihfftn', + 'fftfreq', + 'rfftfreq', + 'fftshift', + 'ifftshift', +] + + +def _check_normalization(norm): + if norm not in ['forward', 'backward', 'ortho']: + raise ValueError( + "Unexpected norm: {}. Norm should be forward, backward or ortho". 
+ format(norm)) + + +def _check_fft_n(n): + if not isinstance(n, int): + raise ValueError( + "Invalid FFT argument n({}), it shoule be an integer.".format(n)) + if n <= 0: + raise ValueError( + "Invalid FFT argument n({}), it should be positive.".format(n)) + + +def _check_fft_shape(x, s): + ndim = x.ndim + if not isinstance(s, Sequence): + raise ValueError( + "Invaid FFT argument s({}), it should be a sequence of integers.") + + if len(s) > ndim: + raise ValueError( + "Length of FFT argument s should not be larger than the rank of input. " + "Received s: {}, rank of x: {}".format(s, ndim)) + for size in s: + if not isinstance(size, int) or size <= 0: + raise ValueError("FFT sizes {} contains invalid value ({})".format( + s, size)) + + +def _check_fft_axis(x, axis): + ndim = x.ndim + if not isinstance(axis, int): + raise ValueError( + "Invalid FFT axis ({}), it shoule be an integer.".format(axis)) + if axis < -ndim or axis >= ndim: + raise ValueError( + "Invalid FFT axis ({}), it should be in range [-{}, {})".format( + axis, ndim, ndim)) + + +def _check_fft_axes(x, axes): + ndim = x.ndim + if not isinstance(axes, Sequence): + raise ValueError( + "Invalid FFT axes ({}), it should be a sequence of integers.". + format(axes)) + if len(axes) > ndim: + raise ValueError( + "Length of fft axes should not be larger than the rank of input. " + "Received, len of axes: {}, rank of x: {}".format(len(axes), ndim)) + for axis in axes: + if not isinstance(axis, int) or axis < -ndim or axis >= ndim: + raise ValueError( + "FFT axes {} contains invalid value ({}), it should be in range [-{}, {})". + format(axes, axis, ndim, ndim)) + + +def _resize_fft_input(x, s, axes): + if len(s) != len(axes): + raise ValueError("length of `s` should equals length of `axes`.") + shape = x.shape + ndim = x.ndim + + axes_to_pad = [] + paddings = [] + axes_to_slice = [] + slices = [] + for i, axis in enumerate(axes): + if shape[axis] < s[i]: + axes_to_pad.append(axis) + paddings.append(s[i] - shape[axis]) + elif shape[axis] > s[i]: + axes_to_slice.append(axis) + slices.append((0, s[i])) + + if axes_to_slice: + x = paddle.slice( + x, + axes_to_slice, + starts=[item[0] for item in slices], + ends=[item[1] for item in slices]) + if axes_to_pad: + padding_widths = [0] * (2 * ndim) + for axis, pad in zip(axes_to_pad, paddings): + padding_widths[2 * axis + 1] = pad + x = paddle.nn.functional.pad(x, padding_widths) + return x + + +def _normalize_axes(x, axes): + ndim = x.ndim + return [item if item >= 0 else (item + ndim) for item in axes] + + +def _check_at_least_ndim(x, rank): + if x.ndim < rank: + raise ValueError("The rank of the input ({}) should >= {}".format( + x.ndim, rank)) + + +# public APIs 1d +def fft(x, n=None, axis=-1, norm="backward", name=None): + """ + Calculate one-dimensional discrete Fourier transform. + + This function uses the efficient fast Fourier transform (FFT) algorithm [1] to + calculate the 1-D * n * point discrete Fourier transform (DFT). + + Args: + x (Tensor): The input data. It's a Tensor type. It's a complex. + n (int, optional): The length of the output transform axis. If `n` is less than + the length input, the input will be cropped. If larger, the input is filled + with zeros. If `n` is not given, the input length along the axis specified + by `axis` is used. + axis (int, optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. 
The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + scaled by ``1/sqrt(n)``. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + complex tensor. The truncated or zero-padded input, transformed along the axis indicated + by `axis`, or the last one if `axis` is not specified. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.exp(3j * np.pi * np.arange(7) / 7) + xp = paddle.to_tensor(x) + fft_xp = paddle.fft.fft(xp).numpy() + print(fft_xp) + # [1.+1.25396034e+00j 1.+4.38128627e+00j 1.-4.38128627e+00j + # 1.-1.25396034e+00j 1.-4.81574619e-01j 1.+8.88178420e-16j + # 1.+4.81574619e-01j] + + + """ + if is_interger(x) or is_floating_point(x): + return fft_r2c( + x, n, axis, norm, forward=True, onesided=False, name=name) + else: + return fft_c2c(x, n, axis, norm, forward=True, name=name) + + +def ifft(x, n=None, axis=-1, norm="backward", name=None): + """ + Compute the 1-D inverse discrete Fourier Transform. + + This function computes the inverse of the 1-D *n*-point discrete Fourier transform + computed by `fft`. In other words, ``ifft(fft(x)) == x`` to within numerical accuracy. + + The input should be ordered in the same way as is returned by `fft`, + i.e., + + * ``x[0]`` should contain the zero frequency term, + * ``x[1:n//2]`` should contain the positive-frequency terms, + * ``x[n//2 + 1:]`` should contain the negative-frequency terms, in + increasing order starting from the most negative frequency. + + For an even number of input points, ``x[n//2]`` represents the sum of + the values at the positive and negative Nyquist frequencies, as the two + are aliased together. + + Args: + x (Tensor): The input data. It's a Tensor type. It's a complex. + n (int, optional): The length of the output transform axis. If `n` is less than + the length input, the input will be cropped. If larger, the input is filled + with zeros. If `n` is not given, the input length along the axis specified + by `axis` is used. + axis (int, optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + scaled by ``1/sqrt(n)``. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + complex tensor. The truncated or zero-padded input, transformed along the axis indicated + by `axis`, or the last one if `axis` is not specified. + + Examples: + + .. 
code-block:: python + + import numpy as np + import paddle + + x = np.exp(3j * np.pi * np.arange(7) / 7) + xp = paddle.to_tensor(x) + ifft_xp = paddle.fft.ifft(xp).numpy() + print(ifft_xp) + # [0.14285714+1.79137191e-01j 0.14285714+6.87963741e-02j + # 0.14285714+1.26882631e-16j 0.14285714-6.87963741e-02j + # 0.14285714-1.79137191e-01j 0.14285714-6.25898038e-01j + # 0.14285714+6.25898038e-01j] + + """ + if is_interger(x) or is_floating_point(x): + return fft_r2c( + x, n, axis, norm, forward=False, onesided=False, name=name) + else: + return fft_c2c(x, n, axis, norm, forward=False, name=name) + + +def rfft(x, n=None, axis=-1, norm="backward", name=None): + """ + The one dimensional FFT for real input. + + This function computes the one dimensional *n*-point discrete Fourier + Transform (DFT) of a real-valued tensor by means of an efficient algorithm + called the Fast Fourier Transform (FFT). + + When the DFT is computed for purely real input, the output is + Hermitian-symmetric. This function does not compute the negative frequency + terms, and the length of the transformed axis of the output is therefore + ``n//2 + 1``. + + Args: + x(Tensor) : Real-valued input tensor + n(int, optional): Number of points along transformation axis in the + input to use. If `n` is smaller than the length of the input, the + input is cropped. If it is larger, the input is padded with zeros. + If `n` is not given, the length of the input along the axis + specified by `axis` is used. + axis(int, optional): Axis over which to compute the FFT. Default value + is last axis. + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, + default value is "backward". + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . + + Returns: + out(Tensor) : complex tensor + + Raises: + + + Examples: + .. code-block:: python + import paddle + + x = paddle.to_tensor([0.0, 1.0, 0.0, 0.0]) + print(paddle.fft.rfft(x)) + # Tensor(shape=[3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # [ (1+0j), -1j , (-1+0j)]) + """ + return fft_r2c(x, n, axis, norm, forward=True, onesided=True, name=name) + + +def irfft(x, n=None, axis=-1, norm="backward", name=None): + """ + Computes the inverse of `rfft`. + + This function calculates the inverse of the one-dimensional *n* point discrete + Fourier transform of the actual input calculated by "rfft". In other words, + ``irfft(rfft(a),len(a)) == a`` is within the numerical accuracy range. + + The input shall be in the form of "rfft", i.e. the actual zero frequency term, + followed by the complex positive frequency term, in the order of increasing frequency. + Because the discrete Fourier transform of the actual input is Hermite symmetric, + the negative frequency term is regarded as the complex conjugate term of the corresponding + positive frequency term. + + Args: + x (Tensor): The input data. It's a Tensor type. It's a complex. + n (int, optional): The length of the output transform axis. For `n` output + points, ``n//2 + 1``input points are necessary. If the length of the input tensor is greater + than `n`, it will be cropped, if it is shorter than this, fill in zero. If `n` is not given, + it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified + along the ` axis'. 
+ axis (int, optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name` . + + Returns: + Real tensor. Truncated or zero fill input for the transformation along the axis indicated by + `axis`, or the last input if `axis` is not specified. The length of the conversion axis + is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. + If the output is an odd number, you need to specify the value of 'n', such as ``2 * k-1`` + in some cases. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.array([1, -1j, -1]) + xp = paddle.to_tensor(x) + irfft_xp = paddle.fft.irfft(xp).numpy() + print(irfft_xp) + # [0. 0. 0. 4.] + + """ + return fft_c2r(x, n, axis, norm, forward=False, name=name) + + +def hfft(x, n=None, axis=-1, norm="backward", name=None): + """ + Compute the FFT of a signal that has Hermitian symmetry, a real + spectrum. + + Args: + x (Tensor): The input data. It's a Tensor type. It's a complex. + n (int, optional): The length of the output transform axis. For `n` output + points, ``n//2 + 1`` input points are necessary. If the length of the input tensor is greater + than `n`, it will be cropped, if it is shorter than this, fill in zero. If `n` is not given, + it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified + along the ` axis'. + axis (int,optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name` . + + Returns: + Real tensor. Truncated or zero fill input for the transformation along the axis indicated by + `axis`, or the last input if `axis` is not specified. The length of the conversion axis + is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. + If the output is an odd number, you need to specify the value of 'n', such as ``2 * k-1`` in + some cases. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.array([1, -1j, -1]) + xp = paddle.to_tensor(x) + hfft_xp = paddle.fft.hfft(xp).numpy() + print(hfft_xp) + # [0. 0. 0. 4.] + """ + + return fft_c2r(x, n, axis, norm, forward=True, name=name) + + +def ihfft(x, n=None, axis=-1, norm="backward", name=None): + """ + The inverse FFT of a signal that has Hermitian symmetry. + + This function computes the one dimensional *n*-point inverse FFT of a signal + that has Hermitian symmetry by means of an efficient algorithm called + the Fast Fourier Transform (FFT). + + When the DFT is computed for purely real input, the output is + Hermitian-symmetric. This function does not compute the negative frequency + terms, and the length of the transformed axis of the output is therefore + ``n//2 + 1``. 
+ + Args: + x(Tensor): Input tensor. + n(int, optional): The number of points along transformation axis in the + input to use. If `n` is smaller than the length of the input, the + input is cropped. If it is larger, the input is padded with zeros. + If `n` is not given, the length of the input along the axis + specified by `axis` is used. + axis(int, optional) : Axis over which to compute the inverse FFT. If not + given, the last axis is used. + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, + default value is "backward". + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . + + Returns: + out(Tensor) : complex tensor. + + Examples: + .. code-block:: python + import paddle + + spectrum = paddle.to_tensor([10.0, -5.0, 0.0, -1.0, 0.0, -5.0]) + print(paddle.fft.ifft(spectrum)) + # Tensor(shape=[6], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # [(-0.1666666716337204+0j), (1-1.9868215517249155e-08j), (2.3333334922790527-1.9868215517249155e-08j), (3.5+0j), (2.3333334922790527+1.9868215517249155e-08j), (1+1.9868215517249155e-08j)]) + print(paddle.fft.ihfft(spectrum)) + # Tensor(shape = [4], dtype = complex64, place = CUDAPlace(0), stop_gradient = True, + # [(-0.1666666716337204+0j), (1-1.9868215517249155e-08j), (2.3333334922790527-1.9868215517249155e-08j), (3.5+0j)]) + + """ + return fft_r2c(x, n, axis, norm, forward=False, onesided=True, name=name) + + +# public APIs nd +def fftn(x, s=None, axes=None, norm="backward", name=None): + """ + Compute the N-D discrete Fourier Transform. + + This function calculates the n-D discrete Fourier transform on any number of axes + in the M-D array by fast Fourier transform (FFT). + + Args: + x (Tensor): The input data. It's a Tensor type. It's a complex. + s (sequence of ints, optional): Shape (length of each transformed axis) of the output + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). + This corresponds to ``n`` for ``fft(x, n)``. + Along any axis, if the given shape is smaller than that of the input, + the input is cropped. If it is larger, the input is padded with zeros. + if `s` is not given, the shape of the input along the axes specified + by `axes` is used. + axes (sequence of ints, optional): Axes used to calculate FFT. If not given, the last ``len(s)`` + axes are used, or all axes if `s` is also not specified. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + scaled by ``1/sqrt(n)``. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + complex tensor. The truncated or zero-padded input, transformed along the axes indicated by + `axes`, or by a combination of `s` and `x`, as explained in the parameters section above. + + Examples: + + .. 
code-block:: python + + import numpy as np + import paddle + + x = x = np.mgrid[:4, :4, :4][1] + xp = paddle.to_tensor(x) + fftn_xp = paddle.fft.fftn(xp, axes=(1, 2)).numpy() + print(fftn_xp) + # [[[24.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]] + # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]] + # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]] + # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]]] + """ + if is_interger(x) or is_floating_point(x): + return fftn_r2c( + x, s, axes, norm, forward=True, onesided=False, name=name) + else: + return fftn_c2c(x, s, axes, norm, forward=True, name=name) + + +def ifftn(x, s=None, axes=None, norm="backward", name=None): + """ + Compute the N-D inverse discrete Fourier Transform. + + This function computes the inverse of the N-D discrete + Fourier Transform over any number of axes in an M-D array by + means of the Fast Fourier Transform (FFT). In other words, + ``ifftn(fftn(x)) == x`` to within numerical accuracy. + + The input, analogously to `ifft`, should be ordered in the same way as is + returned by `fftn`, i.e., it should have the term for zero frequency + in all axes in the low-order corner, the positive frequency terms in the + first half of all axes, the term for the Nyquist frequency in the middle + of all axes and the negative frequency terms in the second half of all + axes, in order of decreasingly negative frequency. + + Args: + x (Tensor): The input data. It's a Tensor type. It's a complex. + s (sequence of ints, optional): Shape (length of each transformed axis) of the output + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). + This corresponds to ``n`` for ``fft(x, n)``. + Along any axis, if the given shape is smaller than that of the input, + the input is cropped. If it is larger, the input is padded with zeros. + if `s` is not given, the shape of the input along the axes specified + by `axes` is used. + axes (sequence of ints, optional): Axes used to calculate FFT. If not given, the last ``len(s)`` + axes are used, or all axes if `s` is also not specified. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + scaled by ``1/sqrt(n)``. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + complex tensor. The truncated or zero-padded input, transformed along the axes indicated by + `axes`, or by a combination of `s` and `x`, as explained in the parameters section above. + + Examples: + + .. 
code-block:: python + + import numpy as np + import paddle + + x = np.eye(3) + xp = paddle.to_tensor(x) + ifftn_xp = paddle.fft.ifftn(xp, axes=(1,)).numpy() + print(ifftn_xp) + + # [[ 0.33333333+0.j 0.33333333+0.j 0.33333333-0.j ] + # [ 0.33333333+0.j -0.16666667+0.28867513j -0.16666667-0.28867513j] + # [ 0.33333333+0.j -0.16666667-0.28867513j -0.16666667+0.28867513j]] + + """ + if is_interger(x) or is_floating_point(x): + return fftn_r2c( + x, s, axes, norm, forward=False, onesided=False, name=name) + else: + return fftn_c2c(x, s, axes, norm, forward=False, name=name) + + +def rfftn(x, s=None, axes=None, norm="backward", name=None): + """ + The N dimensional FFT for real input. + + This function computes the N-dimensional discrete Fourier Transform over + any number of axes in an M-dimensional real array by means of the Fast + Fourier Transform (FFT). By default, all axes are transformed, with the + real transform performed over the last axis, while the remaining + transforms are complex. + + The transform for real input is performed over the last transformation + axis, as by `rfft`, then the transform over the remaining axes is + performed as by `fftn`. The order of the output is as for `rfft` for the + final transformation axis, and as for `fftn` for the remaining + transformation axes. + + Args: + x(Tensor) : Input tensor, taken to be real. + s(Sequence[int]) : Shape to use from the exec fft. The final element of + `s` corresponds to `n` for ``rfft(x, n)``, while for the remaining + axes, it corresponds to `n` for ``fft(x, n)``. Along any axis, if + the given shape is smaller than that of the input, the input is + cropped. If it is larger, the input is padded with zeros. if `s` is + not given, the shape of the input along the axes specified by `axes` + is used. + axes(Sequence[int]) : Axes over which to compute the FFT. If not given, + the last ``len(s)`` axes are used, or all axes if `s` is also not + specified. + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, + default value is "backward". + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . + + Returns: + out(Tensor): complex tensor + + + Raises: + ValueError: If `s` and `axes` have different length. + + Examples: + .. code-block:: python + import paddle + + # default, all axis will be used to exec fft + x = paddle.ones((2, 3, 4)) + print(paddle.fft.rfftn(x)) + # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # [[[(24+0j), 0j , 0j ], + # [0j , 0j , 0j ], + # [0j , 0j , 0j ]], + # + # [[0j , 0j , 0j ], + # [0j , 0j , 0j ], + # [0j , 0j , 0j ]]]) + + # use axes(2, 0) + print(paddle.fft.rfftn(x, axes=(2, 0))) + # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # [[[(24+0j), 0j , 0j ], + # [0j , 0j , 0j ], + # [0j , 0j , 0j ]], + # + # [[0j , 0j , 0j ], + # [0j , 0j , 0j ], + # [0j , 0j , 0j ]]]) + + """ + return fftn_r2c(x, s, axes, norm, forward=True, onesided=True, name=name) + + +def irfftn(x, s=None, axes=None, norm="backward", name=None): + """ + Computes the inverse of `rfftn`. + + This function computes the inverse of the N-D discrete + Fourier Transform for real input over any number of axes in an + M-D array by means of the Fast Fourier Transform (FFT). 
In + other words, ``irfftn(rfftn(x), x.shape) == x`` to within numerical + accuracy. (The ``a.shape`` is necessary like ``len(a)`` is for `irfft`, + and for the same reason.) + + The input should be ordered in the same way as is returned by `rfftn`, + i.e., as for `irfft` for the final transformation axis, and as for `ifftn` + along all the other axes. + + Args: + x (Tensor): The input data. It's a Tensor type. + s (sequence of ints, optional): The length of the output transform axis. + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). `s` is also the + number of input points used along this axis, except for the last axis, + where ``s[-1]//2+1`` points of the input are used. Along any axis, if + the shape indicated by `s` is smaller than that of the input, the input + is cropped. If it is larger, the input is padded with zeros. + If `s` is not given, the shape of the input along the axes specified by axes + is used. Except for the last axis which is taken to be ``2*(k-1)`` where + ``k`` is the length of the input along that axis. + axes (sequence of ints, optional): Axes over which to compute the inverse FFT. If not given, the last + `len(s)` axes are used, or all axes if `s` is also not specified. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Real tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, + or by a combination of `s` or `x`, as explained in the parameters section above. The length of + each transformed axis is as given by the corresponding element of `s`, or the length of the input + in every axis except for the last one if `s` is not given. In the final transformed axis the length + of the output when `s` is not given is ``2*(m-1)``, where ``m`` is the length of the final + transformed axis of the input. To get an odd number of output points in the final axis, + `s` must be specified. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = (np.array([2, 2, 3]) + 1j * np.array([2, 2, 3])).astype(np.complex128) + xp = paddle.to_tensor(x) + irfftn_xp = paddle.fft.irfftn(xp).numpy() + print(irfftn_xp) + # [ 2.25 -1.25 0.25 0.75] + + """ + return fftn_c2r(x, s, axes, norm, forward=False, name=name) + + +def hfftn(x, s=None, axes=None, norm="backward", name=None): + """ + Compute the N-D FFT of Hermitian symmetric complex input, i.e., a + signal with a real spectrum. + + This function calculates the n-D discrete Fourier transform of Hermite symmetric + complex input on any axis in M-D array by fast Fourier transform (FFT). + In other words, ``ihfftn(hfftn(x, s)) == x is within the numerical accuracy range. + (``s`` here are ``x.shape`` and ``s[-1] = x.shape[- 1] * 2 - 1``. This is necessary + for the same reason that ``irfft` requires ``x.shape``.) + + Args: + x (Tensor): The input data. It's a Tensor type. + s (sequence of ints, optional): The length of the output transform axis. + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). `s` is also the + number of input points used along this axis, except for the last axis, + where ``s[-1]//2+1`` points of the input are used. 
Along any axis, if + the shape indicated by `s` is smaller than that of the input, the input + is cropped. If it is larger, the input is padded with zeros. + If `s` is not given, the shape of the input along the axes specified by axes + is used. Except for the last axis which is taken to be ``2*(k-1)`` where + ``k`` is the length of the input along that axis. + axes (sequence of ints, optional): Axes over which to compute the inverse FFT. If not given, the last + `len(s)` axes are used, or all axes if `s` is also not specified. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Real tensor. Truncate or zero fill input, transforming along the axis indicated by axis or + a combination of `s` or `X`. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = (np.array([2, 2, 3]) + 1j * np.array([2, 2, 3])).astype(np.complex128) + xp = paddle.to_tensor(x) + hfftn_xp = paddle.fft.hfftn(xp).numpy() + print(hfftn_xp) + # [ 9. 3. 1. -5.] + + + """ + return fftn_c2r(x, s, axes, norm, forward=True, name=name) + + +def ihfftn(x, s=None, axes=None, norm="backward", name=None): + """ + The n dimensional inverse FFT of a signal that has Hermitian symmetry. + + This function computes the n dimensional inverse FFT over any number of axes + in an M-dimensional of a signal that has Hermitian symmetry by means of an + efficient algorithm called the Fast Fourier Transform (FFT). + + Args: + x(Tensor): Input tensor. + s(Sequence[int], optional) : Shape (length along each transformed axis) + to use from the input. (``s[0]`` refers to axis 0, ``s[1]`` to axis + 1, etc.). Along any axis, if the given shape is smaller than that + of the input, the input is cropped. If it is larger, the input is + padded with zeros. if `s` is not given, the shape of the input + along the axes specified by `axes` is used. + axis(Sequence[int], optional) : Axis over which to compute the inverse FFT. If not + given, the last axis is used. + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, + default value is "backward". + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . + + Returns: + out(Tensor) : complex tensor. + + Examples: + .. 
code-block:: python + import paddle + + spectrum = paddle.to_tensor([10.0, -5.0, 0.0, -1.0, 0.0, -5.0]) + print(paddle.fft.ifft(spectrum)) + # Tensor(shape=[6], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # [(-0.1666666716337204+0j), (1-1.9868215517249155e-08j), (2.3333334922790527-1.9868215517249155e-08j), (3.5+0j), (2.3333334922790527+1.9868215517249155e-08j), (1+1.9868215517249155e-08j)]) + print(paddle.fft.ihfft(spectrum)) + # Tensor(shape = [4], dtype = complex64, place = CUDAPlace(0), stop_gradient = True, + # [(-0.1666666716337204+0j), (1-1.9868215517249155e-08j), (2.3333334922790527-1.9868215517249155e-08j), (3.5+0j)]) + + """ + return fftn_r2c(x, s, axes, norm, forward=False, onesided=True, name=name) + + +# public APIs 2d +def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None): + """ + Compute the 2-D discrete Fourier Transform + + This function computes the N-D discrete Fourier Transform + over any axes in an M-D array by means of the + Fast Fourier Transform (FFT). By default, the transform is computed over + the last two axes of the input array, i.e., a 2-dimensional FFT. + + Args: + x (Tensor): The input data. It's a Tensor type. + s (sequence of ints, optional): Shape (length of each transformed axis) of the output. + It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``. + Along each axis, if the given shape is smaller than that of the input, + the input is cropped. If it is larger, the input is padded with zeros. + if `s` is not given, the shape of the input along the axes specified + by `axes` is used. Default is None. + axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a + sequence of 2 integers. If not specified, the last two axes are used by default. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, + or the last two axes if `axes` is not given. + + Raises: + ValueError: if `s` not be a sequence of 2 integers or None. + ValueError: if `axes` not be a sequence of 2 integers or None. + ValueError: If the input dimension is smaller than 2. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.mgrid[:2, :2][1] + xp = paddle.to_tensor(x) + fft2_xp = paddle.fft.fft2(xp).numpy() + print(fft2_xp) + # [[ 2.+0.j -2.+0.j] + # [ 0.+0.j 0.+0.j]] + + """ + _check_at_least_ndim(x, 2) + if s is not None: + if not isinstance(s, Sequence) or len(s) != 2: + raise ValueError( + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". + format(s)) + if axes is not None: + if not isinstance(axes, Sequence) or len(axes) != 2: + raise ValueError( + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". + format(axes)) + return fftn(x, s, axes, norm, name) + + +def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None): + """ + Compute the 2-D inverse discrete Fourier Transform. + + This function computes the inverse of the 2-D discrete Fourier + Transform over any number of axes in an M-D array by means of + the Fast Fourier Transform (FFT). 
In other words, ``ifft2(fft2(x)) == x`` + to within numerical accuracy. By default, the inverse transform is + computed over the last two axes of the input array. + + The input, analogously to `ifft`, should be ordered in the same way as is + returned by `fft2`, i.e., it should have the term for zero frequency + in the low-order corner of the two axes, the positive frequency terms in + the first half of these axes, the term for the Nyquist frequency in the + middle of the axes and the negative frequency terms in the second half of + both axes, in order of decreasingly negative frequency. + + Args: + x (Tensor): The input data. It's a Tensor type. + s (sequence of ints, optional): Shape (length of each transformed axis) of the output. + It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``. + Along each axis, if the given shape is smaller than that of the input, + the input is cropped. If it is larger, the input is padded with zeros. + if `s` is not given, the shape of the input along the axes specified + by `axes` is used. Default is None. + axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a + sequence of 2 integers. If not specified, the last two axes are used by default. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, + or the last two axes if `axes` is not given. + + Raises: + ValueError: if `s` not be a sequence of 2 integers or None. + ValueError: if `axes` not be a sequence of 2 integers or None. + ValueError: If the input dimension is smaller than 2. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.mgrid[:2, :2][1] + xp = paddle.to_tensor(x) + ifft2_xp = paddle.fft.ifft2(xp).numpy() + print(ifft2_xp) + # [[ 0.5+0.j -0.5+0.j] + # [ 0. +0.j 0. +0.j]] + """ + _check_at_least_ndim(x, 2) + if s is not None: + if not isinstance(s, Sequence) or len(s) != 2: + raise ValueError( + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". + format(s)) + if axes is not None: + if not isinstance(axes, Sequence) or len(axes) != 2: + raise ValueError( + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". + format(axes)) + return ifftn(x, s, axes, norm, name) + + +def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): + """ + The two dimensional FFT with real tensor input. + + This is really just `rfftn` with different default behavior. + For more details see `rfftn`. + + Args: + x(Tensor): Input tensor, taken to be real. + s(Sequence[int]) : Shape of the FFT. + axes(Sequence[int], optional): Axes over which to compute the FFT. + norm(str, optional) : {"backward", "ortho", "forward"}, + default is "backward". Indicates which direction of the + forward/backward pair of transforms is scaled and with what + normalization factor. + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . + + Returns: + out(Tensor): The result of the real 2-D FFT. + + Raises: + + + Examples: + + .. 
code-block:: python + import paddle + import numpy as np + + x = paddle.to_tensor(np.mgrid[:5, :5][0].astype(np.float32)) + print(paddle.fft.rfft2(x)) + # Tensor(shape=[5, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # [[ (50+0j) , (1.1920928955078125e-07+0j) , 0j ], + # [(-12.5+17.204774856567383j) , (-9.644234211236835e-08+7.006946134424652e-08j) , 0j ], + # [(-12.500000953674316+4.061495304107666j) , (3.6837697336977726e-08-1.1337477445749755e-07j), 0j ], + # [(-12.500000953674316-4.061495304107666j) , (3.6837697336977726e-08+1.1337477445749755e-07j), 0j ], + # [(-12.5-17.204774856567383j) , (-9.644234211236835e-08-7.006946134424652e-08j) , 0j ]]) + """ + _check_at_least_ndim(x, 2) + if s is not None: + if not isinstance(s, Sequence) or len(s) != 2: + raise ValueError( + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". + format(s)) + if axes is not None: + if not isinstance(axes, Sequence) or len(axes) != 2: + raise ValueError( + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". + format(axes)) + return rfftn(x, s, axes, norm, name) + + +def irfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): + """ + Computes the inverse of `rfft2`. + + Args: + x (Tensor): The input data. It's a Tensor type. + s (sequence of ints, optional): Shape of the real output to the inverse FFT. Default is None. + axes (sequence of ints, optional): The axes over which to compute the inverse FFT. Axes + must be two-dimensional. If not specified, the last two axes are used by default. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name` . + + Returns: + Real tensor. The result of the inverse real 2-D FFT. + + Raises: + ValueError: if `s` not be a sequence of 2 integers or None. + ValueError: if `axes` not be a sequence of 2 integers or None. + ValueError: If the input dimension is smaller than 2. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = (np.array([[3,2,3],[2, 2, 3]]) + 1j * np.array([[3,2,3],[2, 2, 3]])).astype(np.complex128) + xp = paddle.to_tensor(x) + irfft2_xp = paddle.fft.irfft2(xp).numpy() + print(irfft2_xp) + # [[ 2.375 -1.125 0.375 0.875] + # [ 0.125 0.125 0.125 0.125]] + + """ + _check_at_least_ndim(x, 2) + if s is not None: + if not isinstance(s, Sequence) or len(s) != 2: + raise ValueError( + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". + format(s)) + if axes is not None: + if not isinstance(axes, Sequence) or len(axes) != 2: + raise ValueError( + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". + format(axes)) + return irfftn(x, s, axes, norm, name) + + +def hfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): + """ + Compute the 2-D FFT of a Hermitian complex array. + + Args: + x (Tensor): The input data. It's a Tensor type. + s (sequence of ints, optional): Shape of the real output. Default is None. + axes (sequence of ints, optional): Axes over which to compute the FFT. Axes must be + two-dimensional. If not specified, the last two axes are used by default. 
+ norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Real tensor. The real result of the 2-D Hermitian complex real FFT. + + Raises: + ValueError: if `s` not be a sequence of 2 integers or None. + ValueError: if `axes` not be a sequence of 2 integers or None. + ValueError: If the input dimension is smaller than 2. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = (np.array([[3,2,3],[2, 2, 3]]) + 1j * np.array([[3,2,3],[2, 2, 3]])).astype(np.complex128) + xp = paddle.to_tensor(x) + hfft2_xp = paddle.fft.hfft2(xp).numpy() + print(hfft2_xp) + # [[19. 7. 3. -9.] + # [ 1. 1. 1. 1.]] + + + """ + _check_at_least_ndim(x, 2) + if s is not None: + if not isinstance(s, Sequence) or len(s) != 2: + raise ValueError( + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". + format(s)) + if axes is not None: + if not isinstance(axes, Sequence) or len(axes) != 2: + raise ValueError( + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". + format(axes)) + return hfftn(x, s, axes, norm, name) + + +def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): + """ + Compute the two dimensional inverse FFT of a real spectrum. + + This is really `ihfftn` with different defaults. + For more details see `ihfftn`. + + Args: + x(Tensor): Input tensor + s(Sequence[int], optional): Shape of the real input to the inverse FFT. + axes(Sequance[int], optional): The axes over which to compute the + inverse fft. Default is the last two axes. + norm(str, optional): {"backward", "ortho", "forward"}. Default is + "backward". + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . + + Returns: + out(Tensor) : The result of the inverse real 2-D FFT. + """ + _check_at_least_ndim(x, 2) + if s is not None: + if not isinstance(s, Sequence) or len(s) != 2: + raise ValueError( + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". + format(s)) + if axes is not None: + if not isinstance(axes, Sequence) or len(axes) != 2: + raise ValueError( + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". + format(axes)) + return ihfftn(x, s, axes, norm, name) + + +# public APIs utilities +def fftfreq(n, d=1.0, dtype=None, name=None): + """ + Return the Discrete Fourier Transform sample frequencies. + + The returned float array `f` contains the frequency bin centers in cycles + per unit of the sample spacing (with zero at the start). For instance, if + the sample spacing is in seconds, then the frequency unit is cycles/second. + + Given input length `n` and a sample spacing `d`:: + + f = [0, 1, ..., n/2-1, -n/2, ..., -1] / (d*n) if n is even + f = [0, 1, ..., (n-1)/2, -(n-1)/2, ..., -1] / (d*n) if n is odd + + Args: + n (int): Dimension inputed. + d (scalar, optional): Sample spacing (inverse of the sampling rate). Defaults is 1. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. 
A tensor of length 'n' containing the sampling frequency. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.array([3, 1, 2, 2, 3], dtype=float) + scalar_temp = 0.5 + n = x.size + fftfreq_xp = paddle.fft.fftfreq(n, d=scalar_temp) + print(fftfreq_xp) + + # Tensor(shape=[5], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [ 0. , 0.40000001, 0.80000001, -0.80000001, -0.40000001]) + """ + + dtype = paddle.framework.get_default_dtype() + val = 1.0 / (n * d) + pos_max = (n + 1) // 2 + neg_max = n // 2 + indices = paddle.arange(-neg_max, pos_max, dtype=dtype, name=name) + indices = paddle.roll(indices, -neg_max, name=name) + return indices * val + + +def rfftfreq(n, d=1.0, dtype=None, name=None): + """ + Return the Discrete Fourier Transform sample frequencies. + + The returned floating-point array "F" contains the center of the frequency unit, + and the unit is the number of cycles of the sampling interval (the starting point is zero). + + Given input length `n` and a sample spacing `d`:: + + f = [0, 1, ..., n/2-1, n/2] / (d*n) if n is even + f = [0, 1, ..., (n-1)/2-1, (n-1)/2] / (d*n) if n is odd + + the Nyquist frequency component is considered to be positive. + + Args: + n (int): Dimension inputed. + d (scalar, optional): Sample spacing (inverse of the sampling rate). Defaults is 1. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. A tensor of length ``n//2 + 1`` containing the sample frequencies. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.array([3, 1, 2, 2, 3], dtype=float) + scalar_temp = 0.3 + n = x.size + rfftfreq_xp = paddle.fft.rfftfreq(n, d=scalar_temp) + print(rfftfreq_xp) + + # Tensor(shape=[3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [0. , 0.66666669, 1.33333337]) + + """ + + dtype = paddle.framework.get_default_dtype() + val = 1.0 / (n * d) + pos_max = 1 + n // 2 + indices = paddle.arange(0, pos_max, dtype=dtype, name=name) + return indices * val + + +def fftshift(x, axes=None, name=None): + """ + Shift the zero-frequency component to the center of the spectrum. + + This function swaps half spaces for all the axes listed (all by default). + Note that ``y[0]`` is the Nyquist component only if ``len(x)`` is even. + + Args: + n (int): Dimension inputed. + axes (int|tuple, optional): The axis on which to move. The default is none, which moves all axes. + Default is None. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. The shifted tensor. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.array([3, 1, 2, 2, 3], dtype=float) + scalar_temp = 0.3 + n = x.size + fftfreq_xp = paddle.fft.fftfreq(n, d=scalar_temp) + res = paddle.fft.fftshift(fftfreq_xp).numpy() + print(res) + # [-1.3333334 -0.6666667 0. 0.6666667 1.3333334] + + """ + shape = paddle.shape(x) + if axes is None: + # shift all axes + rank = paddle.rank(x).reshape([1]) + axes = axes or paddle.arange(0, rank) + shifts = [size // 2 for size in shape] + elif isinstance(axes, int): + shifts = shape[axes] // 2 + else: + shifts = [shape[ax] // 2 for ax in axes] + return paddle.roll(x, shifts, axes, name=name) + + +def ifftshift(x, axes=None, name=None): + """ + The inverse of `fftshift`. 
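A hedged sketch of how fftshift and ifftshift compose (ifftshift undoes fftshift for both even- and odd-length inputs), reusing the fftfreq call from the example above:

import paddle

freqs = paddle.fft.fftfreq(5, d=0.3)         # approx. [0., 0.667, 1.333, -1.333, -0.667]
centered = paddle.fft.fftshift(freqs)        # zero frequency moved to the middle
restored = paddle.fft.ifftshift(centered)    # back to the original fftfreq ordering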
Although the even length 'x' is the same, the function of the + odd length 'x' is different. An example. + + Args: + n (int): Dimension inputed. + axes (int|tuple, optional): The axis on which to move. The default is none, which moves all axes. + Default is None. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. The shifted tensor. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.array([3, 1, 2, 2, 3], dtype=float) + scalar_temp = 0.3 + n = x.size + fftfreq_xp = paddle.fft.fftfreq(n, d=scalar_temp) + res = paddle.fft.ifftshift(fftfreq_xp).numpy() + print(res) + # [ 1.3333334 -1.3333334 -0.6666667 0. 0.6666667] + + """ + shape = paddle.shape(x) + if axes is None: + # shift all axes + rank = paddle.rank(x).reshape([1]) + axes = axes or paddle.arange(0, rank) + shifts = [-size // 2 for size in shape] + elif isinstance(axes, int): + shifts = -shape[axes] // 2 + else: + shifts = [-shape[ax] // 2 for ax in axes] + return paddle.roll(x, shifts, axes, name=name) + + +# internal functions +def fft_c2c(x, n, axis, norm, forward, name): + if is_interger(x): + x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) + elif is_floating_point(x): + x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) + _check_normalization(norm) + + axis = axis or -1 + _check_fft_axis(x, axis) + axes = [axis] + axes = _normalize_axes(x, axes) + if n is not None: + _check_fft_n(n) + s = [n] + x = _resize_fft_input(x, s, axes) + op_type = 'fft_c2c' + + check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) + if in_dygraph_mode(): + attrs = ('axes', axes, 'normalization', norm, 'forward', forward) + out = getattr(_C_ops, op_type)(x, *attrs) + else: + inputs = {'X': [x], } + attrs = {'axes': axes, 'normalization': norm, 'forward': forward} + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference(dtype) + outputs = {"Out": [out]} + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + return out + + +def fft_r2c(x, n, axis, norm, forward, onesided, name): + if is_interger(x): + x = paddle.cast(x, paddle.get_default_dtype()) + _check_normalization(norm) + axis = axis or -1 + _check_fft_axis(x, axis) + axes = [axis] + axes = _normalize_axes(x, axes) + if n is not None: + _check_fft_n(n) + s = [n] + x = _resize_fft_input(x, s, axes) + op_type = 'fft_r2c' + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], op_type) + + if in_dygraph_mode(): + attrs = ('axes', axes, 'normalization', norm, 'forward', forward, + 'onesided', onesided) + out = getattr(_C_ops, op_type)(x, *attrs) + else: + inputs = {'X': [x], } + attrs = { + 'axes': axes, + 'normalization': norm, + 'forward': forward, + 'onesided': onesided, + } + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference( + _real_to_complex_dtype(dtype)) + outputs = {"Out": [out]} + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + return out + + +def fft_c2r(x, n, axis, norm, forward, name): + if is_interger(x): + x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) + elif is_floating_point(x): + x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) + _check_normalization(norm) + axis = axis or -1 + 
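+    # Validate the single transform axis, normalize a negative index, and
+    # (when n is given) resize the input so the half-spectrum length is n // 2 + 1.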
_check_fft_axis(x, axis) + axes = [axis] + axes = _normalize_axes(x, axes) + if n is not None: + _check_fft_n(n) + s = [n // 2 + 1] + x = _resize_fft_input(x, s, axes) + op_type = 'fft_c2r' + check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) + + if in_dygraph_mode(): + if n is not None: + attrs = ('axes', axes, 'normalization', norm, 'forward', forward, + 'last_dim_size', n) + else: + attrs = ('axes', axes, 'normalization', norm, 'forward', forward) + out = getattr(_C_ops, op_type)(x, *attrs) + else: + inputs = {'X': [x], } + attrs = {'axes': axes, 'normalization': norm, 'forward': forward} + if n is not None: + attrs['last_dim_size'] = n + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference( + _complex_to_real_dtype(dtype)) + outputs = {"Out": [out]} + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + return out + + +def fftn_c2c(x, s, axes, norm, forward, name): + if is_interger(x): + x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) + elif is_floating_point(x): + x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) + _check_normalization(norm) + if s is not None: + _check_fft_shape(x, s) + + rank = x.ndim + if axes is None: + if s is None: + axes = list(range(rank)) + else: + fft_ndims = len(s) + axes = list(range(rank - fft_ndims, rank)) + else: + _check_fft_axes(x, axes) + axes = _normalize_axes(x, axes) + axes_argsoft = np.argsort(axes).tolist() + axes = [axes[i] for i in axes_argsoft] + if s is not None: + if len(s) != len(axes): + raise ValueError( + "Length of s ({}) and length of axes ({}) does not match.". + format(len(s), len(axes))) + s = [s[i] for i in axes_argsoft] + + if s is not None: + x = _resize_fft_input(x, s, axes) + op_type = 'fft_c2c' + check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) + + if in_dygraph_mode(): + attrs = ('axes', axes, 'normalization', norm, 'forward', forward) + out = getattr(_C_ops, op_type)(x, *attrs) + else: + inputs = {'X': [x], } + attrs = {'axes': axes, 'normalization': norm, 'forward': forward} + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference(dtype) + outputs = {"Out": [out]} + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + return out + + +def fftn_r2c(x, s, axes, norm, forward, onesided, name): + if is_interger(x): + x = paddle.cast(x, paddle.get_default_dtype()) + _check_normalization(norm) + if s is not None: + _check_fft_shape(x, s) + + rank = x.ndim + if axes is None: + if s is None: + axes = list(range(rank)) + else: + fft_ndims = len(s) + axes = list(range(rank - fft_ndims, rank)) + else: + _check_fft_axes(x, axes) + axes = _normalize_axes(x, axes) + axes_argsoft = np.argsort(axes[:-1]).tolist() + axes = [axes[i] for i in axes_argsoft] + [axes[-1]] + if s is not None: + if len(s) != len(axes): + raise ValueError( + "Length of s ({}) and length of axes ({}) does not match.". 
+ format(len(s), len(axes))) + s = [s[i] for i in axes_argsoft] + [s[-1]] + + if s is not None: + x = _resize_fft_input(x, s, axes) + + op_type = 'fft_r2c' + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], op_type) + + if in_dygraph_mode(): + attrs = ('axes', axes, 'normalization', norm, 'forward', forward, + 'onesided', onesided) + out = getattr(_C_ops, op_type)(x, *attrs) + else: + inputs = {'X': [x], } + attrs = { + 'axes': axes, + 'normalization': norm, + 'forward': forward, + 'onesided': onesided, + } + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference( + _real_to_complex_dtype(dtype)) + outputs = {"Out": [out]} + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + + return out + + +def fftn_c2r(x, s, axes, norm, forward, name): + if is_interger(x): + x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) + elif is_floating_point(x): + x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) + _check_normalization(norm) + if s is not None: + _check_fft_shape(x, s) + + rank = x.ndim + if axes is None: + if s is None: + axes = list(range(rank)) + else: + fft_ndims = len(s) + axes = list(range(rank - fft_ndims, rank)) + else: + _check_fft_axes(x, axes) + axes = _normalize_axes(x, axes) + axes_argsoft = np.argsort(axes[:-1]).tolist() + axes = [axes[i] for i in axes_argsoft] + [axes[-1]] + if s is not None: + if len(s) != len(axes): + raise ValueError( + "Length of s ({}) and length of axes ({}) does not match.". + format(len(s), len(axes))) + s = [s[i] for i in axes_argsoft] + [s[-1]] + + if s is not None: + fft_input_shape = list(s) + fft_input_shape[-1] = fft_input_shape[-1] // 2 + 1 + x = _resize_fft_input(x, fft_input_shape, axes) + + op_type = 'fft_c2r' + check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) + + if in_dygraph_mode(): + if s: + attrs = ('axes', axes, 'normalization', norm, 'forward', forward, + 'last_dim_size', s[-1]) + else: + attrs = ('axes', axes, 'normalization', norm, 'forward', forward) + out = getattr(_C_ops, op_type)(x, *attrs) + else: + inputs = {'X': [x], } + attrs = {'axes': axes, 'normalization': norm, 'forward': forward} + if s: + attrs["last_dim_size"] = s[-1] + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference( + _complex_to_real_dtype(dtype)) + outputs = {"Out": [out]} + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + return out diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 8135df35203c1..9f2c4316d542d 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -14,10 +14,12 @@ import numpy as np from ..fluid.layer_helper import LayerHelper -from ..fluid.data_feeder import check_variable_and_dtype, check_type +from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype from ..fluid.framework import in_dygraph_mode, _varbase_creator, Variable from ..fluid.layers import transpose, cast # noqa: F401 +from ..fluid import layers +import paddle from paddle.common_ops_import import core from paddle.common_ops_import import VarDesc from paddle import _C_ops @@ -549,8 +551,8 @@ def cond(x, p=None, name=None): Computes the condition number of a matrix or batches of matrices with respect to a matrix norm ``p``. 
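The N-d kernels above (fftn_c2c, fftn_r2c, fftn_c2r) back the public n-dimensional API; a minimal round-trip sketch through the real-input pair, assuming a real float32 input:

import paddle

x = paddle.randn([4, 6])                       # real input
spec = paddle.fft.rfftn(x)                     # one-sided spectrum via fftn_r2c, shape [4, 4]
x_back = paddle.fft.irfftn(spec, s=x.shape)    # real reconstruction via fftn_c2r, shape [4, 6]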
Args: - x (Tensor): The input tensor could be tensor of shape ``(*, m, n)`` where ``*`` is zero or more batch dimensions - for ``p`` in ``(2, -2)``, or of shape ``(*, n, n)`` where every matrix is invertible for any supported ``p``. + x (Tensor): The input tensor could be tensor of shape ``(*, m, n)`` where ``*`` is zero or more batch dimensions + for ``p`` in ``(2, -2)``, or of shape ``(*, n, n)`` where every matrix is invertible for any supported ``p``. And the input data type could be ``float32`` or ``float64``. p (float|string, optional): Order of the norm. Supported values are `fro`, `nuc`, `1`, `-1`, `2`, `-2`, `inf`, `-inf`. Default value is `None`, meaning that the order of the norm is `2`. @@ -605,7 +607,7 @@ def cond(x, p=None, name=None): # out_minus_inf.numpy() [1.] a = paddle.to_tensor(np.random.randn(2, 4, 4).astype('float32')) - # a.numpy() + # a.numpy() # [[[ 0.14063153 -0.996288 0.7996131 -0.02571543] # [-0.16303636 1.5534962 -0.49919784 -0.04402903] # [-1.1341571 -0.6022629 0.5445269 0.29154757] @@ -973,8 +975,8 @@ def t(input, name=None): return out check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'], - 'transpose') + input, 'input', ['float16', 'float32', 'float64', 'int32', + 'int64'], 'transpose') helper = LayerHelper('t', **locals()) out = helper.create_variable_for_type_inference(input.dtype) @@ -1106,17 +1108,17 @@ def matrix_rank(x, tol=None, hermitian=False, name=None): r""" Computes the rank of a matrix. - The rank of a matrix is the number of singular values that are greater than the specified `tol` threshold when hermitian=False, + The rank of a matrix is the number of singular values that are greater than the specified `tol` threshold when hermitian=False, or the number of eigenvalues in absolute value that are greater than the specified `tol` threshold when hermitian=True. Args: - x (Tensor): The input tensor. Its shape should be `[..., m, n]`, where `...` is zero or more batch dimensions. If `x` is a batch - of matrices then the output has the same batch dimensions. The data type of `x` should be float32 or float64. - tol (float,Tensor,optional): the tolerance value. Default: None. If `tol` is not specified, and `sigma` is the largest - singular value (or eigenvalues in absolute value), and `eps` is the epsilon value for the dtype of `x`, then `tol` is computed + x (Tensor): The input tensor. Its shape should be `[..., m, n]`, where `...` is zero or more batch dimensions. If `x` is a batch + of matrices then the output has the same batch dimensions. The data type of `x` should be float32 or float64. + tol (float,Tensor,optional): the tolerance value. Default: None. If `tol` is not specified, and `sigma` is the largest + singular value (or eigenvalues in absolute value), and `eps` is the epsilon value for the dtype of `x`, then `tol` is computed with formula `tol=sigma * max(m,n) * eps`. Note that if `x` is a batch of matrices, `tol` is computed this way for every batch. - hermitian (bool,optional): indicates whether `x` is Hermitian. Default: False. When hermitian=True, `x` is assumed to be Hermitian, - enabling a more efficient method for finding eigenvalues, but `x` is not checked inside the function. Instead, We just use + hermitian (bool,optional): indicates whether `x` is Hermitian. Default: False. When hermitian=True, `x` is assumed to be Hermitian, + enabling a more efficient method for finding eigenvalues, but `x` is not checked inside the function. 
Instead, We just use the lower triangular of the matrix to compute. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -1223,7 +1225,7 @@ def bmm(x, y, name=None): #output value: #[[[6.0, 6.0],[12.0, 12.0]],[[45.0, 45.0],[60.0, 60.0]]] out_np = out.numpy() - + """ x_shape = x.shape y_shape = y.shape @@ -1349,6 +1351,109 @@ def __check_input(x, vec): return out +def det(x): + """ + Calculates determinant value of a square matrix or batches of square matrices. + Args: + x (Tensor): input (Tensor): the input matrix of size `(n, n)` or the batch of matrices of size + `(*, n, n)` where `*` is one or more batch dimensions. + Returns: + y (Tensor):the determinant value of a square matrix or batches of square matrices. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.randn([3,3,3]) + + A = paddle.det(x) + + print(A) + + # [ 0.02547996, 2.52317095, -6.15900707]) + + + """ + if in_dygraph_mode(): + return core.ops.determinant(x) + + check_dtype(x.dtype, 'Input', ['float32', 'float64'], 'det') + + input_shape = list(x.shape) + assert len(input_shape) >= 2, \ + "The x must be at least 2-dimensional, " \ + "but received Input x's dimensional: %s.\n" % \ + len(input_shape) + + assert (input_shape[-1] == input_shape[-2]), \ + "Expect squared input," \ + "but received %s by %s matrix.\n" \ + %(input_shape[-2], input_shape[-1]) \ + + helper = LayerHelper('determinant', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type='determinant', inputs={'Input': [x]}, outputs={'Out': [out]}) + return out + + +def slogdet(x): + """ + Calculates the sign and natural logarithm of the absolute value of a square matrix's or batches square matrices' determinant. + The determinant can be computed with ``sign * exp(logabsdet) + + Supports input of float, double + + Note that for matrices that have zero determinant, this returns ``(0, -inf)`` + Args: + x (Tensor): the batch of matrices of size :math:`(*, n, n)` + where math:`*` is one or more batch dimensions. + + Returns: + y (Tensor): A tensor containing the sign of the determinant and the natural logarithm + of the absolute value of determinant, respectively. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.randn([3,3,3]) + + A = paddle.slogdet(x) + + print(A) + + # [[ 1. , 1. , -1. ], + # [-0.98610914, -0.43010661, -0.10872950]]) + + """ + if in_dygraph_mode(): + return core.ops.slogdeterminant(x) + + check_dtype(x.dtype, 'Input', ['float32', 'float64'], 'slogdet') + + input_shape = list(x.shape) + assert len(input_shape) >= 2, \ + "The x must be at least 2-dimensional, " \ + "but received Input x's dimensional: %s.\n" % \ + len(input_shape) + + assert (input_shape[-1] == input_shape[-2]), \ + "Expect squared input," \ + "but received %s by %s matrix.\n" \ + %(input_shape[-2], input_shape[-1]) \ + + helper = LayerHelper('slogdeterminant', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type='slogdeterminant', inputs={'Input': [x]}, outputs={'Out': [out]}) + return out + + def svd(x, full_matrices=False, name=None): r""" Computes the singular value decomposition of one matrix or a batch of regular matrices. @@ -1356,19 +1461,19 @@ def svd(x, full_matrices=False, name=None): Let :math:`X` be the input matrix or a batch of input matrices, the output should satisfies: .. 
math:: - X = U * diag(S) * VT - + X = U * diag(S) * VT + Args: x (Tensor): The input tensor. Its shape should be `[..., N, M]`, where `...` is zero or more batch dimensions. N and M can be arbitraty - positive number. Note that if x is sigular matrices, the grad is numerical - instable. The data type of x should be float32 or float64. - full_matrices (bool): A flag to control the behavor of svd. - If full_matrices = True, svd op will compute full U and V matrics, + positive number. Note that if x is sigular matrices, the grad is numerical + instable. The data type of x should be float32 or float64. + full_matrices (bool): A flag to control the behavor of svd. + If full_matrices = True, svd op will compute full U and V matrics, which means shape of U is `[..., N, N]`, shape of V is `[..., M, M]`. K = min(M, N). - If full_matrices = False, svd op will use a economic method to store U and V. + If full_matrices = False, svd op will use a economic method to store U and V. which means shape of U is `[..., N, K]`, shape of V is `[..., M, K]`. K = min(M, N). - name (str, optional): Name for the operation (optional, default is None). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -1392,9 +1497,9 @@ def svd(x, full_matrices=False, name=None): print (vh) #VT= [[ 0.51411221, 0.85772294], # [ 0.85772294, -0.51411221]] - + # one can verify : U * S * VT == X - # U * UH == I + # U * UH == I # V * VH == I """ @@ -1421,7 +1526,7 @@ def svd(x, full_matrices=False, name=None): def matrix_power(x, n, name=None): r""" Computes the n-th power of a square matrix or a batch of square matrices. - + Let :math:`X` be a sqaure matrix or a batch of square matrices, :math:`n` be an exponent, the equation should be: @@ -1488,11 +1593,71 @@ def matrix_power(x, n, name=None): return out +def eigvals(x, name=None): + """ + Compute the eigenvalues of one or more general matrices. + + Warning: + The gradient kernel of this operator does not yet developed. + If you need back propagation through this operator, please replace it with paddle.linalg.eig. + + Args: + x (Tensor): A square matrix or a batch of square matrices whose eigenvalues will be computed. + Its shape should be `[*, M, M]`, where `*` is zero or more batch dimensions. + Its data type should be float32, float64, complex64, or complex128. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: A tensor containing the unsorted eigenvalues which has the same batch dimensions with `x`. + The eigenvalues are complex-valued even when `x` is real. + + Examples: + .. code-block:: python + + import paddle + + paddle.set_device("cpu") + paddle.seed(1234) + + x = paddle.rand(shape=[3, 3], dtype='float64') + # [[0.02773777, 0.93004224, 0.06911496], + # [0.24831591, 0.45733623, 0.07717843], + # [0.48016702, 0.14235102, 0.42620817]]) + + print(paddle.linalg.eigvals(x)) + # [(-0.27078833542132674+0j), (0.29962280156230725+0j), (0.8824477020120244+0j)] #complex128 + """ + + check_variable_and_dtype(x, 'dtype', + ['float32', 'float64', 'complex64', + 'complex128'], 'eigvals') + + x_shape = list(x.shape) + if len(x_shape) < 2: + raise ValueError( + "The dimension of Input(x) should be at least 2, but received x's dimention = {}, x's shape = {}". 
+ format(len(x_shape), x_shape)) + + if x_shape[-1] != x_shape[-2]: + raise ValueError( + "The last two dimensions of Input(x) should be equal, but received x's shape = {}". + format(x_shape)) + + if in_dygraph_mode(): + return _C_ops.eigvals(x) + + helper = LayerHelper('eigvals', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type='eigvals', inputs={'X': x}, outputs={'Out': out}) + return out + + def multi_dot(x, name=None): """ Multi_dot is an operator that calculates multiple matrix multiplications. - Supports inputs of float, double and float16 dtypes. This function does not + Supports inputs of float16(only GPU support), float32 and float64 dtypes. This function does not support batched inputs. The input tensor in [x] must be 2-D except for the first and last can be 1-D. @@ -1534,7 +1699,7 @@ def multi_dot(x, name=None): B_data = np.random.random([4, 5]).astype(np.float32) A = paddle.to_tensor(A_data) B = paddle.to_tensor(B_data) - out = paddle.multi_dot([A, B]) + out = paddle.linalg.multi_dot([A, B]) print(out.numpy().shape) # [3, 5] @@ -1545,7 +1710,7 @@ def multi_dot(x, name=None): A = paddle.to_tensor(A_data) B = paddle.to_tensor(B_data) C = paddle.to_tensor(C_data) - out = paddle.multi_dot([A, B, C]) + out = paddle.linalg.multi_dot([A, B, C]) print(out.numpy().shape) # [10, 7] @@ -1570,7 +1735,7 @@ def multi_dot(x, name=None): def eigh(x, UPLO='L', name=None): """ - Compute the eigenvalues and eigenvectors of a + Compute the eigenvalues and eigenvectors of a complex Hermitian (conjugate symmetric) or a real symmetric matrix. Args: @@ -1594,7 +1759,7 @@ def eigh(x, UPLO='L', name=None): x_data = np.array([[1, -2j], [2j, 5]]) x = paddle.to_tensor(x_data) - out_value, out_vector = paddle.eigh(x, UPLO='L') + out_value, out_vector = paddle.linalg.eigh(x, UPLO='L') print(out_value) #[0.17157288, 5.82842712] print(out_vector) @@ -1615,7 +1780,7 @@ def __check_input(x, UPLO): raise ValueError( "The input matrix must be batches of square matrices. But received x's dimention: {}". format(x_shape)) - if UPLO is not 'L' and UPLO is not 'U': + if UPLO != 'L' and UPLO != 'U': raise ValueError( "UPLO must be L or U. But received UPLO is: {}".format(UPLO)) @@ -1635,3 +1800,332 @@ def __check_input(x, UPLO): 'Eigenvectors': out_vector}, attrs={'UPLO': UPLO}) return out_value, out_vector + + +def pinv(x, rcond=1e-15, hermitian=False, name=None): + r""" + Calculate pseudo inverse via SVD(singular value decomposition) + of one matrix or batches of regular matrix. + + .. math:: + + if hermitian == False: + x = u * s * vt (SVD) + out = v * 1/s * ut + else: + x = u * s * ut (eigh) + out = u * 1/s * u.conj().transpose(-2,-1) + + If x is hermitian or symmetric matrix, svd will be replaced with eigh. + + Args: + x(Tensor): The input tensor. Its shape should be (*, m, n) + where * is zero or more batch dimensions. m and n can be + arbitraty positive number. The data type of x should be + float32 or float64 or complex64 or complex128. When data + type is complex64 or cpmplex128, hermitian should be set + True. + + rcond(Tensor, optional): the tolerance value to determine + when is a singular value zero. Defalut:1e-15. + + hermitian(bool, optional): indicates whether x is Hermitian + if complex or symmetric if real. Default: False. + + name(str|None): A name for this layer(optional). If set None, + the layer will be named automatically. + + Returns: + Tensor: The tensor with same data type with x. it represents + pseudo inverse of x. 
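The new det and slogdet ops are consistent with each other; a small hedged check, mirroring the batched examples in their docstrings:

import paddle

x = paddle.randn([3, 3, 3])          # a batch of three 3x3 matrices
d = paddle.det(x)                    # shape [3]
sl = paddle.slogdet(x)               # shape [2, 3]: signs stacked over log|det|
# d should be numerically close to sl[0] * paddle.exp(sl[1])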
Its shape should be (*, n, m). + + Examples: + .. code-block:: python + + import paddle + + x = paddle.arange(15).reshape((3, 5)).astype('float64') + input = paddle.to_tensor(x) + out = paddle.linalg.pinv(input) + print(input) + print(out) + + # input: + # [[0. , 1. , 2. , 3. , 4. ], + # [5. , 6. , 7. , 8. , 9. ], + # [10., 11., 12., 13., 14.]] + + # out: + # [[-0.22666667, -0.06666667, 0.09333333], + # [-0.12333333, -0.03333333, 0.05666667], + # [-0.02000000, 0.00000000, 0.02000000], + # [ 0.08333333, 0.03333333, -0.01666667], + # [ 0.18666667, 0.06666667, -0.05333333]] + + # one can verify : x * out * x = x ; + # or out * x * out = x ; + """ + + if in_dygraph_mode(): + if not hermitian: + # combine svd and matmul op + u, s, vt = _C_ops.svd(x, 'full_matrices', False) + max_singular_val = _C_ops.reduce_max(s, 'dim', [-1], 'keep_dim', True, \ + 'reduce_all', False) + rcond = paddle.to_tensor(rcond, dtype=x.dtype) + cutoff = rcond * max_singular_val + y = float('inf') + y = paddle.to_tensor(y, dtype=x.dtype) + + condition = s > cutoff + cond_int = layers.cast(condition, s.dtype) + cond_not_int = layers.cast(layers.logical_not(condition), s.dtype) + out1 = layers.elementwise_mul(1 / s, cond_int) + out2 = layers.elementwise_mul(1 / y, cond_not_int) + singular = layers.elementwise_add(out1, out2) + st, _ = _C_ops.unsqueeze2(singular, 'axes', [-2]) + + dims = list(range(len(vt.shape))) + perm = dims[:-2] + [dims[-1]] + [dims[-2]] + v, _ = _C_ops.transpose2(vt, 'axis', perm) + + out_1 = v * st + out_2 = _C_ops.matmul_v2(out_1, u, 'trans_x', False, 'trans_y', + True) + return out_2 + else: + # combine eigh and matmul op + s, u = _C_ops.eigh(x, 'UPLO', 'L') + s_abs = paddle.abs(s) + max_singular_val = _C_ops.reduce_max(s_abs, 'dim', [-1], 'keep_dim', True, \ + 'reduce_all', False) + rcond = paddle.to_tensor(rcond, dtype=s.dtype) + cutoff = rcond * max_singular_val + y = float('inf') + y = paddle.to_tensor(y, dtype=s.dtype) + + condition = s_abs > cutoff + cond_int = layers.cast(condition, s.dtype) + cond_not_int = layers.cast(layers.logical_not(condition), s.dtype) + out1 = layers.elementwise_mul(1 / s, cond_int) + out2 = layers.elementwise_mul(1 / y, cond_not_int) + singular = layers.elementwise_add(out1, out2) + st, _ = _C_ops.unsqueeze2(singular, 'axes', [-2]) + + out_1 = u * st + u_conj = _C_ops.conj(u) + out_2 = _C_ops.matmul_v2(out_1, u_conj, 'trans_x', False, 'trans_y', + True) + return out_2 + else: + if not hermitian: + helper = LayerHelper('pinv', **locals()) + dtype = x.dtype + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'pinv') + + u = helper.create_variable_for_type_inference(dtype) + s = helper.create_variable_for_type_inference(dtype) + vt = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='svd', + inputs={'X': [x]}, + outputs={'U': u, + 'VH': vt, + 'S': s}, + attrs={'full_matrices': False}, ) + + max_singular_val = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='reduce_max', + inputs={'X': s}, + outputs={'Out': max_singular_val}, + attrs={'dim': [-1], + 'keep_dim': True, + 'reduce_all': False}) + + rcond = layers.fill_constant(shape=[1], value=rcond, dtype=dtype) + cutoff = rcond * max_singular_val + y = float('inf') + y = layers.fill_constant(shape=[1], value=y, dtype=dtype) + + condition = s > cutoff + cond_int = layers.cast(condition, dtype) + cond_not_int = layers.cast(layers.logical_not(condition), dtype) + out1 = layers.elementwise_mul(1 / s, cond_int) + out2 = layers.elementwise_mul(1 / y, cond_not_int) + 
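+            # Combine the two branches: `singular` holds 1/s where s > cutoff and
+            # 0 (== 1/inf) elsewhere, so near-zero singular values are dropped
+            # instead of amplifying noise in the pseudo inverse.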
singular = layers.elementwise_add(out1, out2) + + st = helper.create_variable_for_type_inference(dtype=dtype) + st_shape = helper.create_variable_for_type_inference(dtype=dtype) + helper.append_op( + type='unsqueeze2', + inputs={'X': singular}, + attrs={'axes': [-2]}, + outputs={'Out': st, + 'XShape': st_shape}) + + dims = list(range(len(vt.shape))) + perm = dims[:-2] + [dims[-1]] + [dims[-2]] + v = helper.create_variable_for_type_inference(dtype) + v_shape = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='transpose2', + inputs={'X': [vt]}, + outputs={'Out': [v], + 'XShape': [v_shape]}, + attrs={'axis': perm}) + + out_1 = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='elementwise_mul', + inputs={'X': v, + 'Y': st}, + outputs={'Out': out_1}, + attrs={'axis': -1, + 'use_mkldnn': False}) + out_1 = helper.append_activation(out_1) + + out_2 = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='matmul_v2', + inputs={'X': out_1, + 'Y': u}, + outputs={'Out': out_2}, + attrs={'trans_x': False, + 'trans_y': True}, ) + return out_2 + else: + helper = LayerHelper('pinv', **locals()) + dtype = x.dtype + check_variable_and_dtype( + x, 'dtype', ['float32', 'float64', 'complex64', + 'complex128'], 'pinv') + + if dtype == paddle.complex128: + s_type = 'float64' + elif dtype == paddle.complex64: + s_type = 'float32' + else: + s_type = dtype + + u = helper.create_variable_for_type_inference(dtype) + s = helper.create_variable_for_type_inference(s_type) + helper.append_op( + type='eigh', + inputs={'X': x}, + outputs={'Eigenvalues': s, + 'Eigenvectors': u}, + attrs={'UPLO': 'L'}) + s_abs = helper.create_variable_for_type_inference(s_type) + helper.append_op( + type='abs', inputs={'X': s}, outputs={'Out': s_abs}) + max_singular_val = helper.create_variable_for_type_inference(s_type) + helper.append_op( + type='reduce_max', + inputs={'X': s_abs}, + outputs={'Out': max_singular_val}, + attrs={'dim': [-1], + 'keep_dim': True, + 'reduce_all': False}) + + rcond = layers.fill_constant(shape=[1], value=rcond, dtype=s_type) + cutoff = rcond * max_singular_val + y = float('inf') + y = layers.fill_constant(shape=[1], value=y, dtype=s_type) + + condition = s_abs > cutoff + cond_int = layers.cast(condition, s_type) + cond_not_int = layers.cast(layers.logical_not(condition), s_type) + out1 = layers.elementwise_mul(1 / s, cond_int) + out2 = layers.elementwise_mul(1 / y, cond_not_int) + singular = layers.elementwise_add(out1, out2) + + st = helper.create_variable_for_type_inference(dtype=s_type) + st_shape = helper.create_variable_for_type_inference(dtype=s_type) + helper.append_op( + type='unsqueeze2', + inputs={'X': singular}, + attrs={'axes': [-2]}, + outputs={'Out': st, + 'XShape': st_shape}) + + out_1 = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='elementwise_mul', + inputs={'X': u, + 'Y': st}, + outputs={'Out': out_1}, + attrs={'axis': -1, + 'use_mkldnn': False}) + out_1 = helper.append_activation(out_1) + + u_conj = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='conj', inputs={'X': u}, outputs={'Out': [u_conj]}) + + out_2 = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='matmul_v2', + inputs={'X': out_1, + 'Y': u_conj}, + outputs={'Out': out_2}, + attrs={'trans_x': False, + 'trans_y': True}, ) + return out_2 + + +def solve(x, y, name=None): + r""" + Computes the solution of a square system of linear equations with a unique solution for input 'X' 
and 'Y'. + Let :math: `X` be a sqaure matrix or a batch of square matrices, :math:`Y` be + a vector/matrix or a batch of vectors/matrices, the equation should be: + + .. math:: + Out = X^-1 * Y + Specifically, + - This system of linear equations has one solution if and only if input 'X' is invertible. + + Args: + x (Tensor): A square matrix or a batch of square matrices. Its shape should be `[*, M, M]`, where `*` is zero or + more batch dimensions. Its data type should be float32 or float64. + y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be `[*, M, K]`, where `*` is zero or + more batch dimensions. Its data type should be float32 or float64. + name(str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: The solution of a square system of linear equations with a unique solution for input 'x' and 'y'. + Its data type should be the same as that of `x`. + + Examples: + .. code-block:: python + + # a square system of linear equations: + # 2*X0 + X1 = 9 + # X0 + 2*X1 = 8 + + import paddle + import numpy as np + + np_x = np.array([[3, 1],[1, 2]]) + np_y = np.array([9, 8]) + x = paddle.to_tensor(np_x, dtype="float64") + y = paddle.to_tensor(np_y, dtype="float64") + out = paddle.linalg.solve(x, y) + + print(out) + # [2., 3.]) + """ + if in_dygraph_mode(): + return _C_ops.solve(x, y) + + inputs = {"X": [x], "Y": [y]} + helper = LayerHelper("solve", **locals()) + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'solve') + check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'solve') + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type="solve", inputs={"X": x, + "Y": y}, outputs={"Out": out}) + return out diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 30477d20e7518..4129a1060daf9 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -682,7 +682,7 @@ def roll(x, shifts, axis=None, name=None): axis = [axis] len_origin_shape = len(origin_shape) - if axis: + if axis is not None: for i in range(len(axis)): if axis[i] >= len_origin_shape or axis[i] < -len_origin_shape: raise ValueError( diff --git a/python/paddle/tensor/signal.py b/python/paddle/tensor/signal.py new file mode 100644 index 0000000000000..86022a1748356 --- /dev/null +++ b/python/paddle/tensor/signal.py @@ -0,0 +1,576 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import paddle + +from .attribute import is_complex, is_floating_point +from .fft import fft_r2c, fft_c2r, fft_c2c +from ..fluid.data_feeder import check_variable_and_dtype +from ..fluid.framework import in_dygraph_mode +from ..fluid.layer_helper import LayerHelper +from .. 
import _C_ops + +__all__ = [ + 'frame', + 'overlap_add', + 'stft', + 'istft', +] + + +def frame(x, frame_length, hop_length, axis=-1, name=None): + """ + Slice the N-dimensional (where N >= 1) input into (overlapping) frames. + + Args: + x (Tensor): The input data which is a N-dimensional (where N >= 1) Tensor + with shape `[..., seq_length]` or `[seq_length, ...]`. + frame_length (int): Length of the frame and `0 < frame_length <= x.shape[axis]`. + hop_length (int): Number of steps to advance between adjacent frames + and `0 < hop_length`. + axis (int, optional): Specify the axis to operate on the input Tensors. Its + value should be 0(the first dimension) or -1(the last dimension). If not + specified, the last axis is used by default. + + Returns: + The output frames tensor with shape `[..., frame_length, num_frames]` if `axis==-1`, + otherwise `[num_frames, frame_length, ...]` where + + `num_framse = 1 + (x.shape[axis] - frame_length) // hop_length` + + Examples: + + .. code-block:: python + + import paddle + from paddle.tensor.signal import frame + + # 1D + x = paddle.arange(8) + y0 = frame(x, frame_length=4, hop_length=2, axis=-1) # [4, 3] + # [[0, 2, 4], + # [1, 3, 5], + # [2, 4, 6], + # [3, 5, 7]] + + y1 = frame(x, frame_length=4, hop_length=2, axis=0) # [3, 4] + # [[0, 1, 2, 3], + # [2, 3, 4, 5], + # [4, 5, 6, 7]] + + # 2D + x0 = paddle.arange(16).reshape([2, 8]) + y0 = frame(x0, frame_length=4, hop_length=2, axis=-1) # [2, 4, 3] + # [[[0, 2, 4], + # [1, 3, 5], + # [2, 4, 6], + # [3, 5, 7]], + # + # [[8 , 10, 12], + # [9 , 11, 13], + # [10, 12, 14], + # [11, 13, 15]]] + + x1 = paddle.arange(16).reshape([8, 2]) + y1 = frame(x1, frame_length=4, hop_length=2, axis=0) # [3, 4, 2] + # [[[0 , 1 ], + # [2 , 3 ], + # [4 , 5 ], + # [6 , 7 ]], + # + # [4 , 5 ], + # [6 , 7 ], + # [8 , 9 ], + # [10, 11]], + # + # [8 , 9 ], + # [10, 11], + # [12, 13], + # [14, 15]]] + + # > 2D + x0 = paddle.arange(32).reshape([2, 2, 8]) + y0 = frame(x0, frame_length=4, hop_length=2, axis=-1) # [2, 2, 4, 3] + + x1 = paddle.arange(32).reshape([8, 2, 2]) + y1 = frame(x1, frame_length=4, hop_length=2, axis=0) # [3, 4, 2, 2] + """ + if axis not in [0, -1]: + raise ValueError(f'Unexpected axis: {axis}. It should be 0 or -1.') + + if not isinstance(frame_length, int) or frame_length <= 0: + raise ValueError( + f'Unexpected frame_length: {frame_length}. It should be an positive integer.' + ) + + if not isinstance(hop_length, int) or hop_length <= 0: + raise ValueError( + f'Unexpected hop_length: {hop_length}. It should be an positive integer.' + ) + + if frame_length > x.shape[axis]: + raise ValueError( + f'Attribute frame_length should be less equal than sequence length, ' + f'but got ({frame_length}) > ({x.shape[axis]}).') + + op_type = 'frame' + + if in_dygraph_mode(): + attrs = ('frame_length', frame_length, 'hop_length', hop_length, 'axis', + axis) + op = getattr(_C_ops, op_type) + out = op(x, *attrs) + else: + check_variable_and_dtype( + x, 'x', ['int32', 'int64', 'float16', 'float32', + 'float64'], op_type) + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference(dtype=dtype) + helper.append_op( + type=op_type, + inputs={'X': x}, + attrs={ + 'frame_length': frame_length, + 'hop_length': hop_length, + 'axis': axis + }, + outputs={'Out': out}) + return out + + +def overlap_add(x, hop_length, axis=-1, name=None): + """ + Reconstructs a tensor consisted of overlap added sequences from input frames. 
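frame and overlap_add are inverse-like operations; a minimal sketch with a non-overlapping hop so the round trip is exact (with overlapping frames, the summed regions differ from the input):

import paddle
from paddle.tensor.signal import frame, overlap_add

x = paddle.arange(8, dtype='float32')
frames = frame(x, frame_length=4, hop_length=4, axis=-1)   # shape [4, 2]
x_back = overlap_add(frames, hop_length=4, axis=-1)        # shape [8], equals x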
+ + Args: + x (Tensor): The input data which is a N-dimensional (where N >= 2) Tensor + with shape `[..., frame_length, num_frames]` or + `[num_frames, frame_length ...]`. + hop_length (int): Number of steps to advance between adjacent frames and + `0 < hop_length <= frame_length`. + axis (int, optional): Specify the axis to operate on the input Tensors. Its + value should be 0(the first dimension) or -1(the last dimension). If not + specified, the last axis is used by default. + + Returns: + The output frames tensor with shape `[..., seq_length]` if `axis==-1`, + otherwise `[seq_length, ...]` where + + `seq_length = (n_frames - 1) * hop_length + frame_length` + + Examples: + + .. code-block:: python + + import paddle + from paddle.tensor.signal import overlap_add + + # 2D + x0 = paddle.arange(16).reshape([8, 2]) + # [[0 , 1 ], + # [2 , 3 ], + # [4 , 5 ], + # [6 , 7 ], + # [8 , 9 ], + # [10, 11], + # [12, 13], + # [14, 15]] + y0 = overlap_add(x0, hop_length=2, axis=-1) # [10] + # [0 , 2 , 5 , 9 , 13, 17, 21, 25, 13, 15] + + x1 = paddle.arange(16).reshape([2, 8]) + # [[0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 ], + # [8 , 9 , 10, 11, 12, 13, 14, 15]] + y1 = overlap_add(x1, hop_length=2, axis=0) # [10] + # [0 , 1 , 10, 12, 14, 16, 18, 20, 14, 15] + + # > 2D + x0 = paddle.arange(32).reshape([2, 1, 8, 2]) + y0 = overlap_add(x0, hop_length=2, axis=-1) # [2, 1, 10] + + x1 = paddle.arange(32).reshape([2, 8, 1, 2]) + y1 = overlap_add(x1, hop_length=2, axis=0) # [10, 1, 2] + """ + if axis not in [0, -1]: + raise ValueError(f'Unexpected axis: {axis}. It should be 0 or -1.') + + if not isinstance(hop_length, int) or hop_length <= 0: + raise ValueError( + f'Unexpected hop_length: {hop_length}. It should be an positive integer.' + ) + + op_type = 'overlap_add' + + if in_dygraph_mode(): + attrs = ('hop_length', hop_length, 'axis', axis) + op = getattr(_C_ops, op_type) + out = op(x, *attrs) + else: + check_variable_and_dtype( + x, 'x', ['int32', 'int64', 'float16', 'float32', + 'float64'], op_type) + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference(dtype=dtype) + helper.append_op( + type=op_type, + inputs={'X': x}, + attrs={'hop_length': hop_length, + 'axis': axis}, + outputs={'Out': out}) + return out + + +def stft(x, + n_fft, + hop_length=None, + win_length=None, + window=None, + center=True, + pad_mode='reflect', + normalized=False, + onesided=True, + name=None): + """ + Short-time Fourier transform (STFT). + + The STFT computes the discrete Fourier transforms (DFT) of short overlapping + windows of the input using this formula: + + .. math:: + X_t[\omega] = \sum_{n = 0}^{N-1}% + \text{window}[n]\ x[t \times H + n]\ % + e^{-{2 \pi j \omega n}/{N}} + + Where: + - :math:`t`: The :math:`t`-th input window. + - :math:`\omega`: Frequency :math:`0 \leq \omega < \text{n\_fft}` for `onesided=False`, + or :math:`0 \leq \omega < \lfloor \text{n\_fft} / 2 \rfloor + 1` for `onesided=True`. + - :math:`N`: Value of `n_fft`. + - :math:`H`: Value of `hop_length`. + + Args: + x (Tensor): The input data which is a 1-dimensional or 2-dimensional Tensor with + shape `[..., seq_length]`. It can be a real-valued or a complex Tensor. + n_fft (int): The number of input samples to perform Fourier transform. + hop_length (int, optional): Number of steps to advance between adjacent windows + and `0 < hop_length`. Default: `None`(treated as equal to `n_fft//4`) + win_length (int, optional): The size of window. 
Default: `None`(treated as equal + to `n_fft`) + window (Tensor, optional): A 1-dimensional tensor of size `win_length`. It will + be center padded to length `n_fft` if `win_length < n_fft`. Default: `None`( + treated as a rectangle window with value equal to 1 of size `win_length`). + center (bool, optional): Whether to pad `x` to make that the + :math:`t \times hop\_length` at the center of :math:`t`-th frame. Default: `True`. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. See + `paddle.nn.functional.pad` for all padding options. Default: `"reflect"` + normalized (bool, optional): Control whether to scale the output by `1/sqrt(n_fft)`. + Default: `False` + onesided (bool, optional): Control whether to return half of the Fourier transform + output that satisfies the conjugate symmetry condition when input is a real-valued + tensor. It can not be `True` if input is a complex tensor. Default: `True` + name (str, optional): The default value is None. Normally there is no need for user + to set this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + The complex STFT output tensor with shape `[..., n_fft//2 + 1, num_frames]`( + real-valued input and `onesided` is `True`) or `[..., n_fft, num_frames]`( + `onesided` is `False`) + + Exampels: + .. code-block:: python + + import paddle + from paddle.tensor.signal import stft + + # real-valued input + x = paddle.randn([8, 48000], dtype=paddle.float64) + y1 = stft(x, n_fft=512) # [8, 257, 376] + y2 = stft(x, n_fft=512, onesided=False) # [8, 512, 376] + + # complex input + x = paddle.randn([8, 48000], dtype=paddle.float64) + \ + paddle.randn([8, 48000], dtype=paddle.float64)*1j # [8, 48000] complex128 + y1 = stft(x, n_fft=512, center=False, onesided=False) # [8, 512, 372] + """ + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64', 'complex64', 'complex128'], + 'stft') + + x_rank = len(x.shape) + assert x_rank in [1, 2], \ + f'x should be a 1D or 2D real tensor, but got rank of x is {x_rank}' + + if x_rank == 1: # (batch, seq_length) + x = x.unsqueeze(0) + + if hop_length is None: + hop_length = int(n_fft // 4) + + assert hop_length > 0, \ + f'hop_length should be > 0, but got {hop_length}.' + + if win_length is None: + win_length = n_fft + + assert 0 < n_fft <= x.shape[-1], \ + f'n_fft should be in (0, seq_length({x.shape[-1]})], but got {n_fft}.' + + assert 0 < win_length <= n_fft, \ + f'win_length should be in (0, n_fft({n_fft})], but got {win_length}.' + + if window is not None: + assert len(window.shape) == 1 and len(window) == win_length, \ + f'expected a 1D window tensor of size equal to win_length({win_length}), but got window with shape {window.shape}.' + else: + window = paddle.ones(shape=(win_length, ), dtype=x.dtype) + + if win_length < n_fft: + pad_left = (n_fft - win_length) // 2 + pad_right = n_fft - win_length - pad_left + window = paddle.nn.functional.pad(window, + pad=[pad_left, pad_right], + mode='constant') + + if center: + assert pad_mode in ['constant', 'reflect'], \ + 'pad_mode should be "reflect" or "constant", but got "{}".'.format(pad_mode) + + pad_length = n_fft // 2 + # FIXME: Input `x` can be a complex tensor but pad does not supprt complex input. 
+ x = paddle.nn.functional.pad(x.unsqueeze(-1), + pad=[pad_length, pad_length], + mode=pad_mode, + data_format="NLC").squeeze(-1) + + x_frames = frame(x=x, frame_length=n_fft, hop_length=hop_length, axis=-1) + x_frames = x_frames.transpose( + perm=[0, 2, + 1]) # switch n_fft to last dim, egs: (batch, num_frames, n_fft) + x_frames = x_frames * window + + norm = 'ortho' if normalized else 'backward' + if is_complex(x_frames): + assert not onesided, \ + 'onesided should be False when input or window is a complex Tensor.' + + if not is_complex(x): + out = fft_r2c( + x=x_frames, + n=None, + axis=-1, + norm=norm, + forward=True, + onesided=onesided, + name=name) + else: + out = fft_c2c( + x=x_frames, n=None, axis=-1, norm=norm, forward=True, name=name) + + out = out.transpose(perm=[0, 2, 1]) # (batch, n_fft, num_frames) + + if x_rank == 1: + out.squeeze_(0) + + return out + + +def istft(x, + n_fft, + hop_length=None, + win_length=None, + window=None, + center=True, + normalized=False, + onesided=True, + length=None, + return_complex=False, + name=None): + """ + Inverse short-time Fourier transform (ISTFT). + + Reconstruct time-domain signal from the giving complex input and window tensor when + nonzero overlap-add (NOLA) condition is met: + + .. math:: + \sum_{t = -\infty}^{\infty}% + \text{window}^2[n - t \times H]\ \neq \ 0, \ \text{for } all \ n + + Where: + - :math:`t`: The :math:`t`-th input window. + - :math:`N`: Value of `n_fft`. + - :math:`H`: Value of `hop_length`. + + Result of `istft` expected to be the inverse of `paddle.tensor.signal.stft`, but it is + not guaranteed to reconstruct a exactly realizible time-domain signal from a STFT + complex tensor which has been modified (via masking or otherwise). Therefore, `istft` + gives the [Griffin-Lim optimal estimate](https://ieeexplore.ieee.org/document/1164317) + (optimal in a least-squares sense) for the corresponding signal. + + Args: + x (Tensor): The input data which is a 2-dimensional or 3-dimensional **complesx** + Tensor with shape `[..., n_fft, num_frames]`. + n_fft (int): The size of Fourier transform. + hop_length (int, optional): Number of steps to advance between adjacent windows + from time-domain signal and `0 < hop_length < win_length`. Default: `None`( + treated as equal to `n_fft//4`) + win_length (int, optional): The size of window. Default: `None`(treated as equal + to `n_fft`) + window (Tensor, optional): A 1-dimensional tensor of size `win_length`. It will + be center padded to length `n_fft` if `win_length < n_fft`. It should be a + real-valued tensor if `return_complex` is False. Default: `None`(treated as + a rectangle window with value equal to 1 of size `win_length`). + center (bool, optional): It means that whether the time-domain signal has been + center padded. Default: `True`. + normalized (bool, optional): Control whether to scale the output by `1/sqrt(n_fft)`. + Default: `False` + onesided (bool, optional): It means that whether the input STFT tensor is a half + of the conjugate symmetry STFT tensor transformed from a real-valued signal + and `istft` will return a real-valued tensor when it is set to `True`. + Default: `True`. + length (int, optional): Specify the length of time-domain signal. Default: `None`( + treated as the whole length of signal). + return_complex (bool, optional): It means that whether the time-domain signal is + real-valued. If `return_complex` is set to `True`, `onesided` should be set to + `False` cause the output is complex. + name (str, optional): The default value is None. 
+            to set this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A tensor of the least-squares estimation of the reconstructed signal(s) with shape
+        `[..., seq_length]`.
+
+    Examples:
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+            from paddle.tensor.signal import stft, istft
+
+            paddle.seed(0)
+
+            # STFT
+            x = paddle.randn([8, 48000], dtype=paddle.float64)
+            y = stft(x, n_fft=512)  # [8, 257, 376]
+
+            # ISTFT
+            x_ = istft(y, n_fft=512)  # [8, 48000]
+
+            np.allclose(x, x_)  # True
+    """
+    check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'istft')
+
+    x_rank = len(x.shape)
+    assert x_rank in [2, 3], \
+        'x should be a 2D or 3D complex tensor, but got rank of x is {}'.format(x_rank)
+
+    if x_rank == 2:  # (n_fft, n_frames) -> (1, n_fft, n_frames)
+        x = x.unsqueeze(0)
+
+    if hop_length is None:
+        hop_length = int(n_fft // 4)
+
+    if win_length is None:
+        win_length = n_fft
+
+    # Assure no gaps between frames.
+    assert 0 < hop_length <= win_length, \
+        'hop_length should be in (0, win_length({})], but got {}.'.format(win_length, hop_length)
+
+    assert 0 < win_length <= n_fft, \
+        'win_length should be in (0, n_fft({})], but got {}.'.format(n_fft, win_length)
+
+    n_frames = x.shape[-1]
+    fft_size = x.shape[-2]
+
+    if onesided:
+        assert (fft_size == n_fft // 2 + 1), \
+            'fft_size should be equal to n_fft // 2 + 1({}) when onesided is True, but got {}.'.format(n_fft // 2 + 1, fft_size)
+    else:
+        assert (fft_size == n_fft), \
+            'fft_size should be equal to n_fft({}) when onesided is False, but got {}.'.format(n_fft, fft_size)
+
+    if window is not None:
+        assert len(window.shape) == 1 and len(window) == win_length, \
+            'expected a 1D window tensor of size equal to win_length({}), but got window with shape {}.'.format(win_length, window.shape)
+    else:
+        window = paddle.ones(shape=(win_length, ))
+
+    if win_length < n_fft:
+        pad_left = (n_fft - win_length) // 2
+        pad_right = n_fft - win_length - pad_left
+        # FIXME: Input `window` can be a complex tensor but pad does not support complex input.
+        window = paddle.nn.functional.pad(window,
+                                          pad=[pad_left, pad_right],
+                                          mode='constant')
+
+    x = x.transpose(
+        perm=[0, 2,
+              1])  # switch n_fft to last dim, e.g. (batch, num_frames, n_fft)
+    norm = 'ortho' if normalized else 'backward'
+
+    if return_complex:
+        assert not onesided, \
+            'onesided should be False when input (output of istft) or window is a complex Tensor.'
+
+        out = fft_c2c(x=x, n=None, axis=-1, norm=norm, forward=False, name=None)
+    else:
+        assert not is_complex(window), \
+            'Data type of window should not be complex when return_complex is False.'
+
+        if onesided is False:
+            x = x[:, :, :n_fft // 2 + 1]
+        out = fft_c2r(x=x, n=None, axis=-1, norm=norm, forward=False, name=None)
+
+    out = overlap_add(
+        x=(out * window).transpose(
+            perm=[0, 2, 1]),  # (batch, n_fft, num_frames)
+        hop_length=hop_length,
+        axis=-1)  # (batch, seq_length)
+
+    window_envelop = overlap_add(
+        x=paddle.tile(
+            x=window * window, repeat_times=[n_frames, 1]).transpose(
+                perm=[1, 0]),  # (n_fft, num_frames)
+        hop_length=hop_length,
+        axis=-1)  # (seq_length, )
+
+    if length is None:
+        if center:
+            out = out[:, (n_fft // 2):-(n_fft // 2)]
+            window_envelop = window_envelop[(n_fft // 2):-(n_fft // 2)]
+    else:
+        if center:
+            start = n_fft // 2
+        else:
+            start = 0
+
+        out = out[:, start:start + length]
+        window_envelop = window_envelop[start:start + length]
+
+    # Check whether the Nonzero Overlap Add (NOLA) constraint is met.
+    if window_envelop.abs().min().item() < 1e-11:
+        raise ValueError(
+            'Abort istft because Nonzero Overlap Add (NOLA) condition failed. For more information about NOLA constraint please see `scipy.signal.check_NOLA` (https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.check_NOLA.html).'
+        )
+
+    out = out / window_envelop
+
+    if x_rank == 2:
+        out.squeeze_(0)
+
+    return out
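The stft/istft docstrings above imply a concrete shape contract (with `center=True`, `num_frames = seq_length // hop_length + 1`, default `hop_length = n_fft // 4`) and an exact round trip whenever the NOLA condition holds. A minimal sketch of that contract, using only the APIs added in this patch; the concrete shapes in the comments are derived from those defaults rather than taken from the patch itself:

    # Round-trip sketch for the new paddle.tensor.signal.stft/istft APIs.
    import numpy as np
    import paddle
    from paddle.tensor.signal import stft, istft

    paddle.seed(0)
    x = paddle.randn([4, 16000], dtype=paddle.float64)

    n_fft = 512
    hop_length = n_fft // 4                        # 128, the default
    num_frames = 16000 // hop_length + 1           # 126, since center=True pads n_fft // 2 on both sides
    y = stft(x, n_fft=n_fft)                       # onesided keeps n_fft // 2 + 1 frequency bins
    assert list(y.shape) == [4, n_fft // 2 + 1, num_frames]  # expected: [4, 257, 126]

    # The default rectangular window with hop_length <= win_length satisfies NOLA,
    # so istft should recover the signal up to numerical error.
    x_rec = istft(y, n_fft=n_fft)                  # expected shape: [4, 16000]
    print(np.allclose(x.numpy(), x_rec.numpy()))   # expected: True
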
diff --git a/python/paddle/utils/deprecated.py b/python/paddle/utils/deprecated.py
index b17bd70c91af2..b7f5ff28d6c74 100755
--- a/python/paddle/utils/deprecated.py
+++ b/python/paddle/utils/deprecated.py
@@ -54,8 +54,8 @@ def deprecated(update_to="", since="", reason="", level=0):
     """
 
     def decorator(func):
-        # TODO(zhiqiu): We temporally disable the warnings for 2.0-bata, and it should be re-enabled in the future.
-        # return func
+        # TODO(zhiqiu): temporarily disable the warnings
+        return func
         """construct warning message, and return a decorated function or class."""
         assert isinstance(update_to, str), 'type of "update_to" must be str.'
         assert isinstance(since, str), 'type of "since" must be str.'
diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py
index 5326c0198add8..84dcdfa4cfcc4 100644
--- a/python/paddle/vision/ops.py
+++ b/python/paddle/vision/ops.py
@@ -32,6 +32,8 @@
     'decode_jpeg',
     'roi_pool',
     'RoIPool',
+    'psroi_pool',
+    'PSRoIPool',
 ]
 
 
@@ -904,6 +906,117 @@ def decode_jpeg(x, mode='unchanged', name=None):
     return out
 
 
+def psroi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None):
+    """
+    Position sensitive region of interest pooling (also known as PSROIPooling) performs
+    position-sensitive average pooling on regions of interest specified by input. It operates
+    on inputs of nonuniform sizes to obtain fixed-size feature maps.
+
+    PSROIPooling was proposed in R-FCN. Please refer to https://arxiv.org/abs/1605.06409 for more details.
+
+    Args:
+        x (Tensor): Input features with shape (N, C, H, W). The data type can be float32 or float64.
+        boxes (Tensor): Box coordinates of ROIs (Regions of Interest) to pool over. It should be
+            a 2-D Tensor with shape (num_rois, 4). Given as [[x1, y1, x2, y2], ...],
+            (x1, y1) is the top left coordinates, and (x2, y2) is the bottom
+            right coordinates.
+        boxes_num (Tensor): The number of boxes contained in each picture in the batch.
+        output_size (int|Tuple(int, int)): The pooled output size (H, W), data type
+            is int32. If int, H and W are both equal to output_size.
+        spatial_scale (float): Multiplicative spatial scale factor to translate ROI coords from their
+            input scale to the scale used when pooling. Default: 1.0.
+        name (str, optional): The default value is None.
+            Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        4-D Tensor. The pooled ROIs with shape (num_rois, output_channels, pooled_h, pooled_w).
+        output_channels equals C / (pooled_h * pooled_w), where C is the number of input channels.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            x = paddle.uniform([2, 490, 28, 28], dtype='float32')
+            boxes = paddle.to_tensor([[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]], dtype='float32')
+            boxes_num = paddle.to_tensor([1, 2], dtype='int32')
+            pool_out = paddle.vision.ops.psroi_pool(x, boxes, boxes_num, 7, 1.0)
+    """
+
+    check_type(output_size, 'output_size', (int, tuple, list), 'psroi_pool')
+    if isinstance(output_size, int):
+        output_size = (output_size, output_size)
+    pooled_height, pooled_width = output_size
+    assert len(x.shape) == 4, \
+        "Input features with shape should be (N, C, H, W)"
+    output_channels = int(x.shape[1] / (pooled_height * pooled_width))
+    if in_dygraph_mode():
+        return core.ops.psroi_pool(x, boxes, boxes_num, "output_channels",
+                                   output_channels, "spatial_scale",
+                                   spatial_scale, "pooled_height",
+                                   pooled_height, "pooled_width", pooled_width)
+
+    helper = LayerHelper('psroi_pool', **locals())
+    dtype = helper.input_dtype()
+    out = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(
+        type='psroi_pool',
+        inputs={'X': x,
+                'ROIs': boxes},
+        outputs={'Out': out},
+        attrs={
+            'output_channels': output_channels,
+            'spatial_scale': spatial_scale,
+            'pooled_height': pooled_height,
+            'pooled_width': pooled_width
+        })
+    return out
+
+
+class PSRoIPool(Layer):
+    """
+    This interface is used to construct a callable object of the ``PSRoIPool`` class. Please
+    refer to :ref:`api_paddle_vision_ops_psroi_pool`.
+
+    Args:
+        output_size (int|Tuple(int, int)): The pooled output size (H, W), data type
+            is int32. If int, H and W are both equal to output_size.
+        spatial_scale (float): Multiplicative spatial scale factor to translate ROI coords from their
+            input scale to the scale used when pooling. Default: 1.0.
+
+    Shape:
+        - x: 4-D Tensor with shape (N, C, H, W).
+        - boxes: 2-D Tensor with shape (num_rois, 4).
+        - boxes_num: 1-D Tensor.
+        - output: 4-D tensor with shape (num_rois, output_channels, pooled_h, pooled_w).
+          output_channels equals C / (pooled_h * pooled_w), where C is the number of input channels.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            psroi_module = paddle.vision.ops.PSRoIPool(7, 1.0)
+            x = paddle.uniform([2, 490, 28, 28], dtype='float32')
+            boxes = paddle.to_tensor([[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]], dtype='float32')
+            boxes_num = paddle.to_tensor([1, 2], dtype='int32')
+            pool_out = psroi_module(x, boxes, boxes_num)
+
+    """
+
+    def __init__(self, output_size, spatial_scale=1.0):
+        super(PSRoIPool, self).__init__()
+        self.output_size = output_size
+        self.spatial_scale = spatial_scale
+
+    def forward(self, x, boxes, boxes_num):
+        return psroi_pool(x, boxes, boxes_num, self.output_size,
+                          self.spatial_scale)
+
+
 def roi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None):
     """
     This operator implements the roi_pooling layer.
@@ -988,7 +1101,7 @@ class RoIPool(Layer):
     Args:
         output_size (int or tuple[int, int]): the pooled output size(h, w), data type is int32. If int, h and w are both equal to output_size.
-        spatial_scale (float, optional): multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0 .
+        spatial_scale (float, optional): multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0.
 
     Returns:
         pool_out (Tensor): the pooled feature, 4D-Tensor with the shape of [num_boxes, C, output_size[0], output_size[1]].
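A quick sanity sketch of the channel arithmetic documented above (output_channels = C / (pooled_h * pooled_w)); the expected shapes in the comments are derived from the docstring example rather than stated explicitly in the patch:

    # Shape bookkeeping for the new psroi_pool / PSRoIPool APIs.
    import paddle
    from paddle.vision.ops import psroi_pool, PSRoIPool

    # C = 490 input channels and output_size = 7 imply
    # output_channels = 490 / (7 * 7) = 10, so 3 boxes give a [3, 10, 7, 7] result.
    x = paddle.uniform([2, 490, 28, 28], dtype='float32')
    boxes = paddle.to_tensor([[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]], dtype='float32')
    boxes_num = paddle.to_tensor([1, 2], dtype='int32')  # 1 box from image 0, 2 boxes from image 1

    out = psroi_pool(x, boxes, boxes_num, output_size=7, spatial_scale=1.0)
    print(out.shape)  # expected: [3, 10, 7, 7]

    # The PSRoIPool layer only stores output_size/spatial_scale and forwards to
    # the functional form, so it should produce the same shape.
    layer = PSRoIPool(7, spatial_scale=1.0)
    print(layer(x, boxes, boxes_num).shape)  # expected: [3, 10, 7, 7]
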
@@ -998,7 +1111,7 @@ class RoIPool(Layer): import paddle from paddle.vision.ops import RoIPool - + data = paddle.rand([1, 256, 32, 32]) boxes = paddle.rand([3, 4]) boxes[:, 2] += boxes[:, 0] + 3 diff --git a/python/setup.py.in b/python/setup.py.in index 1b2897f230fbe..b10d5df541f2f 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -306,6 +306,19 @@ package_data['paddle.libs']= [] package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name] shutil.copy('${WARPCTC_LIBRARIES}', libs_path) +package_data['paddle.libs']+=[ + os.path.basename('${LAPACK_LIB}'), + os.path.basename('${BLAS_LIB}'), + os.path.basename('${GFORTRAN_LIB}'), + os.path.basename('${GNU_RT_LIB_1}')] +shutil.copy('${BLAS_LIB}', libs_path) +shutil.copy('${LAPACK_LIB}', libs_path) +shutil.copy('${GFORTRAN_LIB}', libs_path) +shutil.copy('${GNU_RT_LIB_1}', libs_path) +if not sys.platform.startswith("linux"): + package_data['paddle.libs']+=[os.path.basename('${GNU_RT_LIB_2}')] + shutil.copy('${GNU_RT_LIB_2}', libs_path) + if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_SHARED_LIB}', libs_path) shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path) @@ -325,6 +338,12 @@ else: if '${WITH_LITE}' == 'ON': shutil.copy('${LITE_SHARED_LIB}', libs_path) package_data['paddle.libs']+=['libpaddle_full_api_shared' + ext_name] + if '${LITE_WITH_NNADAPTER}' == 'ON': + shutil.copy('${LITE_NNADAPTER_LIB}', libs_path) + package_data['paddle.libs']+=['libnnadapter' + ext_name] + if '${NNADAPTER_WITH_HUAWEI_ASCEND_NPU}' == 'ON': + shutil.copy('${LITE_NNADAPTER_NPU_LIB}', libs_path) + package_data['paddle.libs']+=['libnnadapter_driver_huawei_ascend_npu' + ext_name] if '${WITH_PSLIB}' == 'ON': shutil.copy('${PSLIB_LIB}', libs_path) @@ -385,13 +404,15 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': if os.name != 'nt': # only change rpath in Release mode, since in Debug mode, ${FLUID_CORE_NAME}.xx is too large to be changed. if "@APPLE@" == "1": - command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so' + commands = ["install_name_tool -id '@loader_path/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'] + commands.append("install_name_tool -add_rpath '@loader_path/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so') else: - command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so' + commands = ["patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'] # The sw_64 not suppot patchelf, so we just disable that. 
if platform.machine() != 'sw_64' and platform.machine() != 'mips64': - if os.system(command) != 0: - raise Exception("patch ${FLUID_CORE_NAME}.%s failed, command: %s" % (ext_name, command)) + for command in commands: + if os.system(command) != 0: + raise Exception("patch ${FLUID_CORE_NAME}.%s failed, command: %s" % (ext_name, command)) ext_modules = [Extension('_foo', ['stub.cc'])] if os.name == 'nt': diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt index 8fd1be69a3d7f..af2203316d8b3 100644 --- a/python/unittest_py/requirements.txt +++ b/python/unittest_py/requirements.txt @@ -5,7 +5,8 @@ mock gym opencv-python<=4.2.0.32 visualdl -paddle2onnx>=0.4 -scipy +paddle2onnx>=0.8.2 +scipy>=1.6 prettytable distro +numpy>=1.20 diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index 5943e997bdfa2..760bc2b168475 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -41,13 +41,13 @@ function add_failed(){ api_params_diff=`python ${PADDLE_ROOT}/tools/check_api_compatible.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec ${PADDLE_ROOT}/paddle/fluid/API_PR.spec` api_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.api ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.api` if [ "$api_spec_diff" != "" -o "${api_params_diff}" != "" ]; then - echo_line="You must have one RD (XiaoguangHu01 or lanxianghit) approval for API change.\n" + echo_line="You must have one RD (XiaoguangHu01, lanxianghit or Superjomn) approval for API change.\n" echo_line="${echo_line} and one TPM approval for API change: \n" echo_line="${echo_line} jzhang533/ZhangJun, dingjiaweiww/DingJiaWei, Heeenrrry/LiKunLun, TCChenlong/ChenLong for general APIs\n" echo_line="${echo_line} PangHua/XiangHui for distributed related APIs\n" echo_line="${echo_line} twismon/WangYunKai, CheQiXiao/CheQiXiao for inference related APIs.\n" - check_approval 1 46782768 47554610 + check_approval 1 46782768 47554610 328693 check_approval 1 29231 23093488 28379894 11935832 2682285 12050047 50894398 fi diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index f65a65c8b75b1..53b5cb9a722c4 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -88,6 +88,12 @@ function run_tools_test() { cd ${CUR_PWD} } +changed_env_var_count=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/paddle | grep 'DEFINE_EXPORTED' | wc -l` +if [[ $changed_env_var_count -gt 0 ]]; then + echo_line="You must have one RD (lanxianghit (Recommend), phlrain or luotao1) approval for changing the FLAGS, which manages the environment variables.\n" + check_approval 1 6836917 47554610 43953930 +fi + if [[ $git_files -gt 19 || $git_count -gt 999 ]];then echo_line="You must have Dianhai approval for change 20+ files or add than 1000+ lines of content.\n" check_approval 1 38231817 diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index b5e12d6f96ddc..54e8d608ac67d 100644 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -528,7 +528,7 @@ 'test_trunc_op', 'test_bernoulli_op', 'test_custom_relu_model', 'test_backward', 'test_conv3d_transpose_part2_op', 'test_complex_transpose', 'test_memory_reuse_exclude_feed_var', 'test_polygon_box_transform', - 'math_function_gpu_test', 'test_program_prune_backward', + 'math_function_gpu_test', 'test_program_prune_backward', 'test_ema_fleet', 'test_fleet_amp_init', 'test_normalize', 'test_correlation', 'test_conv_elementwise_add2_act_fuse_pass', 
'test_imperative_container_layerlist', 'test_dequantize_abs_max_op', @@ -1324,6 +1324,7 @@ 'test_slice_op', 'test_cond', 'test_ema', + 'test_ema_fleet', 'test_nan_inf', 'test_isinstance', 'test_box_clip_op', diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 5fa3a25f4caf0..7d0a2a8953fc8 100644 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -173,6 +173,7 @@ 'test_elementwise_nn_grad', 'test_elementwise_pow_op', 'test_ema', + 'test_ema_fleet', 'test_embedding_id_stop_gradient', 'test_empty_like_op', 'test_entry_attr', @@ -609,6 +610,7 @@ 'test_dequantize_mkldnn_op', 'test_elementwise_add_mkldnn_op', 'test_elementwise_add_bf16_mkldnn_op', + 'test_elementwise_sub_mkldnn_op', 'test_elementwise_mul_mkldnn_op', 'test_elementwise_mul_bf16_mkldnn_op', 'test_fc_mkldnn_op', diff --git a/tools/test_ci_model_benchmark.sh b/tools/test_ci_model_benchmark.sh index ba4c1f6b7f69a..fb842fbcd7689 100644 --- a/tools/test_ci_model_benchmark.sh +++ b/tools/test_ci_model_benchmark.sh @@ -26,14 +26,14 @@ function check_whl { unzip -q build/python/dist/*.whl -d /tmp/pr rm -f build/python/dist/*.whl && rm -f build/python/build/.timestamp - git checkout . - git checkout -b develop_base_pr upstream/$BRANCH - [ $? -ne 0 ] && echo "install paddle failed." && exit 1 rm -rf ${PADDLE_ROOT}/build/Makefile ${PADDLE_ROOT}/build/CMakeCache.txt cmake_change=`git diff --name-only upstream/$BRANCH | grep "cmake/external" || true` if [ ${cmake_change} ];then rm -rf ${PADDLE_ROOT}/build/third_party fi + git checkout . + git checkout -b develop_base_pr upstream/$BRANCH + [ $? -ne 0 ] && echo "checkout paddle branch failed." && exit 1 bash -x paddle/scripts/paddle_build.sh build_only [ $? -ne 0 ] && echo "build paddle failed." && exit 1